提交 9216da3f 编写于 作者: W wanghaox

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into develop

...@@ -21,11 +21,10 @@ third_party/ ...@@ -21,11 +21,10 @@ third_party/
cmake-build-* cmake-build-*
# generated while compiling # generated while compiling
python/paddle/v2/framework/core.so python/paddle/v2/fluid/core.so
paddle/pybind/pybind.h paddle/pybind/pybind.h
CMakeFiles CMakeFiles
cmake_install.cmake cmake_install.cmake
paddle/.timestamp paddle/.timestamp
python/paddlepaddle.egg-info/ python/paddlepaddle.egg-info/
paddle/pybind/pybind.h paddle/pybind/pybind.h
python/paddle/v2/framework/tests/tmp/*
...@@ -36,8 +36,7 @@ include(simd) ...@@ -36,8 +36,7 @@ include(simd)
################################ Configurations ####################################### ################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
...@@ -82,10 +81,8 @@ if(ANDROID OR IOS) ...@@ -82,10 +81,8 @@ if(ANDROID OR IOS)
"Disable PYTHON when cross-compiling for Android and iOS" FORCE) "Disable PYTHON when cross-compiling for Android and iOS" FORCE)
set(WITH_RDMA OFF CACHE STRING set(WITH_RDMA OFF CACHE STRING
"Disable RDMA when cross-compiling for Android and iOS" FORCE) "Disable RDMA when cross-compiling for Android and iOS" FORCE)
set(WITH_MKLDNN OFF CACHE STRING set(WITH_MKL OFF CACHE STRING
"Disable MKLDNN when cross-compiling for Android and iOS" FORCE) "Disable MKL when cross-compiling for Android and iOS" FORCE)
set(WITH_MKLML OFF CACHE STRING
"Disable MKLML package when cross-compiling for Android and iOS" FORCE)
# Compile PaddlePaddle mobile inference library # Compile PaddlePaddle mobile inference library
if (NOT WITH_C_API) if (NOT WITH_C_API)
...@@ -111,6 +108,14 @@ else() ...@@ -111,6 +108,14 @@ else()
set(THIRD_PARTY_BUILD_TYPE Release) set(THIRD_PARTY_BUILD_TYPE Release)
endif() endif()
set(WITH_MKLML ${WITH_MKL})
if (WITH_MKL AND AVX2_FOUND)
set(WITH_MKLDNN ON)
else()
message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
set(WITH_MKLDNN OFF)
endif()
######################################################################################## ########################################################################################
include(external/mklml) # download mklml package include(external/mklml) # download mklml package
...@@ -158,14 +163,15 @@ set(EXTERNAL_LIBS ...@@ -158,14 +163,15 @@ set(EXTERNAL_LIBS
) )
if(WITH_GPU) if(WITH_GPU)
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) include(cuda)
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif(NOT WITH_DSO)
endif(WITH_GPU) endif(WITH_GPU)
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
if(WITH_MKLDNN) if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB}) list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif() endif()
if(USE_NNPACK) if(USE_NNPACK)
......
...@@ -5,6 +5,7 @@ height = 224 ...@@ -5,6 +5,7 @@ height = 224
width = 224 width = 224
num_class = 1000 num_class = 1000
batch_size = get_config_arg('batch_size', int, 128) batch_size = get_config_arg('batch_size', int, 128)
use_gpu = get_config_arg('use_gpu', bool, True)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2( define_py_data_sources2(
...@@ -16,6 +17,8 @@ settings( ...@@ -16,6 +17,8 @@ settings(
learning_method=MomentumOptimizer(0.9), learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size)) regularization=L2Regularization(0.0005 * batch_size))
conv_projection = conv_projection if use_gpu else img_conv_layer
def inception2(name, input, channels, \ def inception2(name, input, channels, \
filter1, filter1,
filter3R, filter3, filter3R, filter3,
...@@ -138,7 +141,7 @@ def inception(name, input, channels, \ ...@@ -138,7 +141,7 @@ def inception(name, input, channels, \
cat = concat_layer( cat = concat_layer(
name=name, name=name,
input=[cov1, cov3, cov5, covprj], input=[cov1, cov3, cov5, covprj],
bias_attr=True, bias_attr=True if use_gpu else False,
act=ReluActivation()) act=ReluActivation())
return cat return cat
......
set -e set -e
function train() { function train() {
unset OMP_NUM_THREADS MKL_NUM_THREADS unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
export OMP_DYNAMIC="FALSE"
export KMP_AFFINITY="granularity=fine,compact,0,0"
topology=$1 topology=$1
layer_num=$2 layer_num=$2
bs=$3 bs=$3
...@@ -14,8 +12,6 @@ function train() { ...@@ -14,8 +12,6 @@ function train() {
elif [ $4 == "False" ]; then elif [ $4 == "False" ]; then
thread=`nproc` thread=`nproc`
# each trainer_count use only 1 core to avoid conflict # each trainer_count use only 1 core to avoid conflict
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log" log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
else else
echo "Wrong input $3, use True or False." echo "Wrong input $3, use True or False."
...@@ -44,6 +40,7 @@ fi ...@@ -44,6 +40,7 @@ fi
for use_mkldnn in True False; do for use_mkldnn in True False; do
for batchsize in 64 128 256; do for batchsize in 64 128 256; do
train vgg 19 $batchsize $use_mkldnn train vgg 19 $batchsize $use_mkldnn
train resnet 50 $batchsize $use_mkldnn train resnet 50 $batchsize $use_mkldnn
train googlenet v1 $batchsize $use_mkldnn
done done
done done
...@@ -76,27 +76,14 @@ else() ...@@ -76,27 +76,14 @@ else()
include_directories(${CUDA_TOOLKIT_INCLUDE}) include_directories(${CUDA_TOOLKIT_INCLUDE})
endif(NOT WITH_GPU) endif(NOT WITH_GPU)
if(WITH_MKLDNN) if (WITH_MKLML AND MKLML_IOMP_LIB)
add_definitions(-DPADDLE_USE_MKLDNN) message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
if (WITH_MKLML AND MKLDNN_IOMP_DIR) set(OPENMP_FLAGS "-fopenmp")
message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}") set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(OPENMP_FLAGS "-fopenmp") set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
else()
find_package(OpenMP)
if(OPENMP_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
else()
message(WARNING "Can not find OpenMP."
"Some performance features in MKLDNN may not be available")
endif()
endif()
endif(WITH_MKLDNN)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
......
...@@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform") ...@@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
# Set the architecture for iOS # Set the architecture for iOS
if(NOT DEFINED IOS_ARCH) if(NOT DEFINED IOS_ARCH)
if(IOS_PLATFORM STREQUAL "OS") if(IOS_PLATFORM STREQUAL "OS")
# FIXME(liuyiqun): support "armv7;armv7s;arm64" future set(IOS_ARCH "armv7;armv7s;arm64")
set(IOS_ARCH "arm64")
elseif(IOS_PLATFORM STREQUAL "SIMULATOR") elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
# FIXME(liuyiqun): support "i386;x86_64" future set(IOS_ARCH "i386;x86_64")
set(IOS_ARCH "x86_64")
endif() endif()
endif() endif()
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS") set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
...@@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_ ...@@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_
# Hidden visibilty is required for cxx on iOS # Hidden visibilty is required for cxx on iOS
set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags") set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first") set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
......
if(NOT WITH_GPU)
return()
endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
######################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
# detect_installed_gpus(out_variable)
function(detect_installed_gpus out_variable)
if(NOT CUDA_gpu_detect_output)
set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
file(WRITE ${cufile} ""
"#include <cstdio>\n"
"int main() {\n"
" int count = 0;\n"
" if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
" if (count == 0) return -1;\n"
" for (int device = 0; device < count; ++device) {\n"
" cudaDeviceProp prop;\n"
" if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
" std::printf(\"%d.%d \", prop.major, prop.minor);\n"
" }\n"
" return 0;\n"
"}\n")
execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}"
"--run" "${cufile}"
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(nvcc_res EQUAL 0)
# only keep the last line of nvcc_out
STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
list(GET nvcc_out -1 nvcc_out)
string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE)
endif()
endif()
if(NOT CUDA_gpu_detect_output)
message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
else()
set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
endif()
endfunction()
########################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
# Usage:
# select_nvcc_arch_flags(out_variable)
function(select_nvcc_arch_flags out_variable)
# List of arch names
set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual")
set(archs_name_default "All")
if(NOT CMAKE_CROSSCOMPILING)
list(APPEND archs_names "Auto")
endif()
# set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui)
set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.")
set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} )
mark_as_advanced(CUDA_ARCH_NAME)
# verify CUDA_ARCH_NAME value
if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
string(REPLACE ";" ", " archs_names "${archs_names}")
message(FATAL_ERROR "Only ${archs_names} architeture names are supported.")
endif()
if(${CUDA_ARCH_NAME} STREQUAL "Manual")
set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
else()
unset(CUDA_ARCH_BIN CACHE)
unset(CUDA_ARCH_PTX CACHE)
endif()
if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
set(cuda_arch_bin "30 35")
elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
set(cuda_arch_bin "50")
elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
set(cuda_arch_bin "60 61")
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
set(cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
detect_installed_gpus(cuda_arch_bin)
else() # (${CUDA_ARCH_NAME} STREQUAL "Manual")
set(cuda_arch_bin ${CUDA_ARCH_BIN})
endif()
# remove dots and convert to lists
string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}")
list(REMOVE_DUPLICATES cuda_arch_bin)
list(REMOVE_DUPLICATES cuda_arch_ptx)
set(nvcc_flags "")
set(nvcc_archs_readable "")
# Tell NVCC to add binaries for the specified GPUs
foreach(arch ${cuda_arch_bin})
if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
# User explicitly specified PTX for the concrete BIN
list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
else()
# User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
list(APPEND nvcc_archs_readable sm_${arch})
endif()
endforeach()
# Tell NVCC to add PTX intermediate code for the specified architectures
foreach(arch ${cuda_arch_ptx})
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
list(APPEND nvcc_archs_readable compute_${arch})
endforeach()
string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
set(${out_variable} ${nvcc_flags} PARENT_SCOPE)
set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
endfunction()
message(STATUS "CUDA detected: " ${CUDA_VERSION})
if (${CUDA_VERSION} LESS 7.0)
set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
# CUDA 8 may complain that sm_20 is no longer supported. Suppress the
# warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
endif()
include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif(NOT WITH_DSO)
# setting nvcc arch flags
select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
# Set C++11 support
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
# Set :expt-relaxed-constexpr to suppress Eigen warnings
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
...@@ -40,10 +40,9 @@ INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) ...@@ -40,10 +40,9 @@ INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
IF(${CBLAS_PROVIDER} STREQUAL "MKLML") IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) SET(MKLDNN_DEPENDS ${MKLML_PROJECT})
SET(MKLDNN_MKLROOT ${MKLML_ROOT}) MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB}) ELSE()
SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR}) MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
ENDIF() ENDIF()
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow") SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
...@@ -57,15 +56,16 @@ ExternalProject_Add( ...@@ -57,15 +56,16 @@ ExternalProject_Add(
PREFIX ${MKLDNN_SOURCES_DIR} PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT} CMAKE_ARGS -DMKLROOT=${MKLML_ROOT}
CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-DMKLROOT:PATH=${MKLDNN_MKLROOT} -DMKLROOT:PATH=${MKLML_ROOT}
) )
ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}") MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
add_definitions(-DPADDLE_USE_MKLDNN)
LIST(APPEND external_project_dependencies mkldnn) LIST(APPEND external_project_dependencies mkldnn)
...@@ -29,7 +29,7 @@ IF(NOT ${CBLAS_FOUND}) ...@@ -29,7 +29,7 @@ IF(NOT ${CBLAS_FOUND})
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE) CACHE FILEPATH "openblas library." FORCE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER}") SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
IF(CMAKE_CROSSCOMPILING) IF(CMAKE_CROSSCOMPILING)
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
...@@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND}) ...@@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND})
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
ENDIF() ENDIF()
ELSEIF(IOS) ELSEIF(IOS)
# FIXME(liuyiqun): support multiple architectures IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
ELSE()
MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
"You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
ENDIF() ENDIF()
ELSEIF(RPI) ELSEIF(RPI)
# use hardfp # use hardfp
...@@ -98,7 +97,7 @@ IF(NOT ${CBLAS_FOUND}) ...@@ -98,7 +97,7 @@ IF(NOT ${CBLAS_FOUND})
ENDIF() ENDIF()
INSTALL(CODE "execute_process( INSTALL(CODE "execute_process(
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib
destination ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR} ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}
)" )"
) )
INSTALL(CODE "MESSAGE(STATUS \"Installing: \" INSTALL(CODE "MESSAGE(STATUS \"Installing: \"
......
...@@ -12,6 +12,10 @@ ...@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
IF(MOBILE_INFERENCE)
return()
ENDIF()
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
......
...@@ -149,58 +149,3 @@ endforeach() ...@@ -149,58 +149,3 @@ endforeach()
foreach(flag ${GPU_COMMON_FLAGS}) foreach(flag ${GPU_COMMON_FLAGS})
safe_set_nvflag(${flag}) safe_set_nvflag(${flag})
endforeach() endforeach()
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
function(specify_cuda_arch cuda_version cuda_arch)
if(${cuda_version} VERSION_GREATER "8.0")
foreach(capability 61 62)
if(${cuda_arch} STREQUAL ${capability})
list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endif()
endforeach()
elseif(${cuda_version} VERSION_GREATER "7.0" and ${cuda_arch} STREQUAL "53")
list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endif()
endfunction()
# Common gpu architectures: Kepler, Maxwell
foreach(capability 30 35 50)
list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
endforeach()
if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
endif()
# Modern gpu architectures: Pascal
if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
endif()
# Custom gpu architecture
set(CUDA_ARCH)
if(CUDA_ARCH)
specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH})
endif()
set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
...@@ -115,8 +115,8 @@ function(link_paddle_exe TARGET_NAME) ...@@ -115,8 +115,8 @@ function(link_paddle_exe TARGET_NAME)
target_link_libraries(${TARGET_NAME} log) target_link_libraries(${TARGET_NAME} log)
endif(ANDROID) endif(ANDROID)
if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR) if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB)
target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed") target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
endif() endif()
add_dependencies(${TARGET_NAME} ${external_project_dependencies}) add_dependencies(${TARGET_NAME} ${external_project_dependencies})
......
...@@ -335,6 +335,16 @@ bilinear_interp ...@@ -335,6 +335,16 @@ bilinear_interp
.. autoclass:: paddle.v2.layer.bilinear_interp .. autoclass:: paddle.v2.layer.bilinear_interp
:noindex: :noindex:
dot_prod
---------
.. autoclass:: paddle.v2.layer.dot_prod
:noindex:
out_prod
--------
.. autoclass:: paddle.v2.layer.out_prod
:noindex:
power power
----- -----
.. autoclass:: paddle.v2.layer.power .. autoclass:: paddle.v2.layer.power
...@@ -372,6 +382,11 @@ cos_sim ...@@ -372,6 +382,11 @@ cos_sim
.. autoclass:: paddle.v2.layer.cos_sim .. autoclass:: paddle.v2.layer.cos_sim
:noindex: :noindex:
l2_distance
-----------
.. autoclass:: paddle.v2.layer.l2_distance
:noindex:
trans trans
----- -----
.. autoclass:: paddle.v2.layer.trans .. autoclass:: paddle.v2.layer.trans
......
## Evaluator Design
### The Problem
During training or serving, we provide the evaluation function to measure the model performance, e.g., accuracy, precision. In the operator based framework design, the data go through the network pipeline batch by batch. As a result, inside the operator, we only can calculate one minibatch metrics. We need to provide a mechanism to calculate the metrics for each N pass/batch the user wanted.
### Evaluator Design
Currently, every operation is expressed in the graph. we divide the evaluator process into three steps.
1. Initialize the metric state and add it into the block.
2. Calculate the statistic of the metric state in every mini-batch. The single operator is only responsible for calculating necessary statistics for one mini-batch. For example, accuracy operator only calculate a minibatch data if run once.
3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
### Implementation
This design is shown in python API.
Each metric operator need to caculate the metric statistic and return the batch aware states, Python side responsible for accumulate the states for each pass.
```python
class Evaluator(object):
"""
Evaluator Base class.
"""
def __init__(self, name, **kwargs):
"""
Different evaluator may has different metric states. E.g, Accuracy need two variables, total and right sample counts.
Auc need four variables, `true_positives`,
`true_negatives`, `false_positives` and `false_negatives`. So every evaluator should create its needed variables and append to main_program
The initialization of Evaluator should be responsible for:
create metric states and append to the main_program
"""
pass
def _update_ops(self, input, label, **kwargs)
"""
Add mini-batch evaluator caculate operators to the main_program.
Add increment operator to accumulate the metric states.
"""
def reset(self, executor, reset_program=None):
"""
Reset metric states at the begin of each pass/user specified batch number.
Execute the reset_program to reset the states.
"""
def eval(self, executor, eval_program=None):
"""
Merge the mini-batch statistics to form the evaluation result for multiple mini-batches.
Execute the eval_program and return the result.
"""
return eval_result
```
...@@ -36,13 +36,13 @@ Figure 1. PaddlePaddle on IA. ...@@ -36,13 +36,13 @@ Figure 1. PaddlePaddle on IA.
我们把集成方案大致分为了如下几个方面。 我们把集成方案大致分为了如下几个方面。
### CMake ### CMake
我们会在`CMakeLists.txt`中会添加`WITH_MKLDNN`的选项,当设置这个值为`ON`的时候会启用编译MKL-DNN功能。同时会自动开启OpenMP用于提高MKL-DNN的性能 我们会在`CMakeLists.txt`中会给用户添加一个`WITH_MKL`的开关,他是负责`WITH_MKLML``WITH_MKLDNN`的总开关
同时,我们会引入`WITH_MKLML`选项,用于选择是否使用MKL-DNN自带的MKLML安装包。这个安装包可以独立于MKL-DNN使用,但是建议在开启MKL-DNN的同时也打开MKLML的开关,这样才能发挥最好的性能。 当打开`WITH_MKL`时,会开启MKLML的功能,作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 如果系统支持AVX2指令集及以上,同时会开启MKL-DNN功能。
所以,我们会在`cmake/external`目录新建`mkldnn.cmake``mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中 当关闭`WITH_MKL`时,MKLML和MKL-DNN功能会同时关闭
**备注**:当`WITH_MKLML=ON`的时候,会优先使用这个包作为PaddlePaddle的CBLAS和LAPACK库,所以会稍微改动`cmake/cblas.cmake`中的逻辑 所以,我们会在`cmake/external`目录新建`mkldnn.cmake``mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中
### Layers ### Layers
所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在 所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在
......
digraph G { digraph G {
rnn [label="1-th level RNN" shape=box] rnn [label="1st level RNN" shape=box]
subgraph cluster0 { subgraph cluster0 {
label = "time step 0" label = "time step 0"
...@@ -8,7 +8,7 @@ digraph G { ...@@ -8,7 +8,7 @@ digraph G {
sent0 [label="sentence"] sent0 [label="sentence"]
sent1 [label="sentence"] sent1 [label="sentence"]
rnn1 [label="2-th level RNN" shape=box] rnn1 [label="2nd level RNN" shape=box]
sent0 -> rnn1 sent0 -> rnn1
sent1 -> rnn1 sent1 -> rnn1
...@@ -20,7 +20,7 @@ digraph G { ...@@ -20,7 +20,7 @@ digraph G {
sent2 [label="sentence"] sent2 [label="sentence"]
sent3 [label="sentence"] sent3 [label="sentence"]
rnn2 [label="2-th level RNN" shape=box] rnn2 [label="2nd level RNN" shape=box]
sent2 -> rnn2 sent2 -> rnn2
sent3 -> rnn2 sent3 -> rnn2
...@@ -32,7 +32,7 @@ digraph G { ...@@ -32,7 +32,7 @@ digraph G {
sent4 [label="sentence"] sent4 [label="sentence"]
sent5 [label="sentence"] sent5 [label="sentence"]
rnn3 [label="2-th level RNN" shape=box] rnn3 [label="2nd level RNN" shape=box]
sent4 -> rnn3 sent4 -> rnn3
sent5 -> rnn3 sent5 -> rnn3
......
# RNNOp design # RNNOp design
This document is about an RNN operator which requires that instances in a mini-batch have the same length. We will have a more flexible RNN operator. This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future.
## RNN Algorithm Implementation ## RNN Algorithm Implementation
<p aligh="center"> <p align="center">
<img src="./images/rnn.jpg"/> <img src="./images/rnn.jpg"/>
</p> </p>
The above diagram shows an RNN unrolled into a full network. The above diagram shows an RNN unrolled into a full network.
There are several important concepts: There are several important concepts here:
- *step-net*: the sub-graph to run at each step, - *step-net*: the sub-graph that runs at each step.
- *memory*, $h_t$, the state of the current step, - *memory*, $h_t$, the state of the current step.
- *ex-memory*, $h_{t-1}$, the state of the previous step, - *ex-memory*, $h_{t-1}$, the state of the previous step.
- *initial memory value*, the ex-memory of the first step. - *initial memory value*, the memory of the first (initial) step.
### Step-scope ### Step-scope
There could be local variables defined in step-nets. PaddlePaddle runtime realizes these variables in *step-scopes* -- scopes created for each step. There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
<p aligh="center"> <p align="center">
<img src="./images/rnn.png"/><br/> <img src="./images/rnn.png"/><br/>
Figure 2 the RNN's data flow Figure 2 illustrates the RNN's data flow
</p> </p>
Please be aware that all steps run the same step-net. Each step Please be aware that every step runs the same step-net. Each step does the following:
1. creates the step-scope, 1. Creates the step-scope.
2. realizes local variables, including step-outputs, in the step-scope, and 2. Initializes the local variables including step-outputs, in the step-scope.
3. runs the step-net, which could use these variables. 3. Runs the step-net, which uses the above mentioned variables.
The RNN operator will compose its output from step outputs in step scopes. The RNN operator will compose its output from step outputs in each of the step scopes.
### Memory and Ex-memory ### Memory and Ex-memory
Let's give more details about memory and ex-memory via a simply example: Let's give more details about memory and ex-memory using a simple example:
$$ $$
h_t = U h_{t-1} + W x_t h_t = U h_{t-1} + W x_t
$$, $$,
where $h_t$ and $h_{t-1}$ are the memory and ex-memory of step $t$'s respectively. where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively.
In the implementation, we can make an ex-memory variable either "refers to" the memory variable of the previous step, In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step,
or copy the value of the previous memory value to the current ex-memory variable. or copy the memory value of the previous step to the current ex-memory variable.
### Usage in Python ### Usage in Python
For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
We can define an RNN's step-net using Block: We can define an RNN's step-net using a Block:
```python ```python
import paddle as pd import paddle as pd
X = some_op() # x is some operator's output, and is a LoDTensor X = some_op() # x is some operator's output and is a LoDTensor
a = some_op() a = some_op()
# declare parameters # declare parameters
...@@ -68,7 +68,7 @@ with rnn.stepnet(): ...@@ -68,7 +68,7 @@ with rnn.stepnet():
x = rnn.add_input(X) x = rnn.add_input(X)
# declare a memory (rnn's step) # declare a memory (rnn's step)
h = rnn.add_memory(init=a) h = rnn.add_memory(init=a)
# h.pre_state() means previous memory of rnn # h.pre_state(), the previous memory of rnn
new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state())) new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state()))
# update current memory # update current memory
h.update(new_state) h.update(new_state)
...@@ -80,19 +80,19 @@ out = rnn() ...@@ -80,19 +80,19 @@ out = rnn()
Python API functions in above example: Python API functions in above example:
- `rnn.add_input` indicates the parameter is a variable that will be segmented into step-inputs. - `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs.
- `rnn.add_memory` creates a variable used as the memory. - `rnn.add_memory`: creates a variable used as the memory.
- `rnn.add_outputs` mark the variables that will be concatenated across steps into the RNN output. - `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output.
### Nested RNN and LoDTensor ### Nested RNN and LoDTensor
An RNN whose step-net includes other RNN operators is known as an *nested RNN*. An RNN whose step-net includes other RNN operators is known as an *nested RNN*.
For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level.
The following figure illustrates the feeding of text into the lower level, one sentence each step, and the feeding of step outputs to the top level. The final top level output is about the whole text. The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.
<p aligh="center"> <p align="center">
<img src="./images/2_level_rnn.png"/> <img src="./images/2_level_rnn.png"/>
</p> </p>
...@@ -110,7 +110,7 @@ a = some_op() ...@@ -110,7 +110,7 @@ a = some_op()
# chapter_data is a set of 128-dim word vectors # chapter_data is a set of 128-dim word vectors
# the first level of LoD is sentence # the first level of LoD is sentence
# the second level of LoD is chapter # the second level of LoD is a chapter
chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2) chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2)
def lower_level_rnn(paragraph): def lower_level_rnn(paragraph):
...@@ -138,14 +138,14 @@ with top_level_rnn.stepnet(): ...@@ -138,14 +138,14 @@ with top_level_rnn.stepnet():
pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state())) pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state()))
top_level_rnn.add_outputs(h) top_level_rnn.add_outputs(h)
# just output the last step # output the last step
chapter_out = top_level_rnn(output_all_steps=False) chapter_out = top_level_rnn(output_all_steps=False)
``` ```
in above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is a LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences. In the above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.
By default, the `RNNOp` will concatenate the outputs from all the time steps, By default, the `RNNOp` will concatenate the outputs from all the time steps.
if the `output_all_steps` set to False, it will only output the final time step. If the `output_all_steps` is set to False, it will only output the final time step.
<p align="center"> <p align="center">
......
# Design: Sequence Decoder Generating LoDTensors # Design: Sequence Decoder Generating LoDTensors
In tasks such as machine translation and image to text, In tasks such as machine translation and visual captioning,
a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences. a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time.
This documentation describes how to implement the sequence decoder as an operator. This documentation describes how to implement the sequence decoder as an operator.
## Beam Search based Decoder ## Beam Search based Decoder
The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences, The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
it is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
In the old version of PaddlePaddle, a C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard to be customized by users.
due to the complexity, the implementation relays on a lot of special data structures,
quite trivial and hard to be customized by users.
There are a lot of heuristic tricks in the sequence generation tasks, There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users.
so the flexibility of sequence decoder is very important to users.
During PaddlePaddle's refactoring work, During the refactoring of PaddlePaddle, some new concepts are proposed such as: [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** .
some new concept is proposed such as [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support sequence usage,
and they can help to make the implementation of beam search based sequence decoder **more transparent and modular** .
For example, the RNN sates, candidates IDs and probabilities of beam search can be represented as `LoDTensors`; For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`;
the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated. the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated.
## Changing LoD's absolute offset to relative offsets ## Changing LoD's absolute offset to relative offsets
The current `LoDTensor` is designed to store levels of variable-length sequences, The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level.
it stores several arrays of integers each represents a level.
The integers in each level represents the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
let's call this format the **absolute-offset LoD** for clear. let's call this format the **absolute-offset LoD** for clarity.
The relative-offset LoD can fast retrieve any sequence but fails to represent empty sequences, for example, a two-level LoD is as follows The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
```python ```python
[[0, 3, 9] [[0, 3, 9]
[0, 2, 3, 3, 3, 9]] [0, 2, 3, 3, 3, 9]]
...@@ -41,10 +34,9 @@ The first level tells that there are two sequences: ...@@ -41,10 +34,9 @@ The first level tells that there are two sequences:
while on the second level, there are several empty sequences that both begin and end at `3`. while on the second level, there are several empty sequences that both begin and end at `3`.
It is impossible to tell how many empty second-level sequences exist in the first-level sequences. It is impossible to tell how many empty second-level sequences exist in the first-level sequences.
There are many scenarios that relay on empty sequence representation, There are many scenarios that rely on empty sequence representation, for example in machine translation or visual captioning, one instance has no translation or the empty candidate set for a prefix.
such as machine translation or image to text, one instance has no translations or the empty candidate set for a prefix.
So let's introduce another format of LoD, So let's introduce another format of LoD,
it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD. it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD.
For example, to represent the same sequences of the above data For example, to represent the same sequences of the above data
...@@ -54,19 +46,18 @@ For example, to represent the same sequences of the above data ...@@ -54,19 +46,18 @@ For example, to represent the same sequences of the above data
[0, 2, 3, 3, 3, 9]] [0, 2, 3, 3, 3, 9]]
``` ```
the first level represents that there are two sequences, the first level represents that there are two sequences,
their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`. their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`.
The second level is the same with the relative offset example because the lower level is a tensor. The second level is the same with the relative offset example because the lower level is a tensor.
It is easy to find out the second sequence in the first-level LoD has two empty sequences. It is easy to find out the second sequence in the first-level LoD has two empty sequences.
The following demos are based on relative-offset LoD. The following examples are based on relative-offset LoD.
## Usage in a simple machine translation model ## Usage in a simple machine translation model
Let's start from a simple machine translation model that is simplified from [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a simple blueprint of what a sequence decoder can do and how to use it. Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it.
The model has an encoder that learns the semantic vector from a sequence, The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence encoder to generate new sentences.
and a decoder which uses the sequence decoder to generate new sentences.
**Encoder** **Encoder**
```python ```python
...@@ -117,7 +108,7 @@ def generate(): ...@@ -117,7 +108,7 @@ def generate():
# which means there are 2 sentences to translate # which means there are 2 sentences to translate
# - the first sentence has 1 translation prefixes, the offsets are [0, 1) # - the first sentence has 1 translation prefixes, the offsets are [0, 1)
# - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6) # - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
# the target_word.lod is # the target_word.lod is
# [[0, 1, 6] # [[0, 1, 6]
# [0, 2, 4, 7, 9 12]] # [0, 2, 4, 7, 9 12]]
# which means 2 sentences to translate, each has 1 and 5 prefixes # which means 2 sentences to translate, each has 1 and 5 prefixes
...@@ -154,37 +145,36 @@ def generate(): ...@@ -154,37 +145,36 @@ def generate():
translation_ids, translation_scores = decoder() translation_ids, translation_scores = decoder()
``` ```
The `decoder.beam_search` is a operator that given the candidates and the scores of translations including the candidates, The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates,
return the result of the beam search algorithm. returns the result of the beam search algorithm.
In this way, users can customize anything on the inputs or outputs of beam search, for example, two ways to prune some translation prefixes In this way, users can customize anything on the input or output of beam search, for example:
1. meke the correspondind elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate. 1. Make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate.
2. remove some specific candidate in `selected_ids` 2. Remove some specific candidate in `selected_ids`.
3. get the final `translation_ids`, remove the translation sequence in it. 3. Get the final `translation_ids`, remove the translation sequence in it.
The implementation of sequence decoder can reuse the C++ class [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30), The implementation of sequence decoder can reuse the C++ class: [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
so the python syntax is quite similar to a [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop). so the python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
Both of them are two-level `LoDTensors` Both of them are two-level `LoDTensors`:
- the first level represents `batch_size` of (source) sentences; - The first level represents `batch_size` of (source) sentences.
- the second level represents the candidate ID sets for translation prefix. - The second level represents the candidate ID sets for translation prefix.
for example, 3 source sentences to translate, and has 2, 3, 1 candidates. For example, 3 source sentences to translate, and has 2, 3, 1 candidates.
Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
a `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
For example, the previous state For example, the previous state:
* LoD is `[0, 1, 3][0, 2, 5, 6]` * LoD is `[0, 1, 3][0, 2, 5, 6]`
* content of tensor is `a1 a2 b1 b2 b3 c1` * content of tensor is `a1 a2 b1 b2 b3 c1`
the current state stored in `encoder_ctx_expanded` the current state is stored in `encoder_ctx_expanded`:
* LoD is `[0, 2, 7][0 3 5 8 9 11 11]` * LoD is `[0, 2, 7][0 3 5 8 9 11 11]`
* the content is * the content is
- a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates) - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates)
- a2 a2 - a2 a2
- b1 b1 b1 - b1 b1 b1
...@@ -192,54 +182,48 @@ the current state stored in `encoder_ctx_expanded` ...@@ -192,54 +182,48 @@ the current state stored in `encoder_ctx_expanded`
- b3 b3 - b3 b3
- None (c1 has 0 candidates, so c1 is dropped) - None (c1 has 0 candidates, so c1 is dropped)
Benefit from the relative offset LoD, empty candidate set can be represented naturally. The benefit from the relative offset LoD is that the empty candidate set can be represented naturally.
the status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor, the corresponding syntax is The status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor. The corresponding syntax is:
```python ```python
decoder.output(selected_ids) decoder.output(selected_ids)
decoder.output(selected_generation_scores) decoder.output(selected_generation_scores)
``` ```
the `selected_ids` is the candidate ids for the prefixes, The `selected_ids` are the candidate ids for the prefixes, and will be `Packed` by `TensorArray` to a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents generated sequences.
it will be `Packed` by `TensorArray` to a two-level `LoDTensor`,
the first level represents the source sequences,
the second level represents generated sequences.
Pack the `selected_scores` will get a `LoDTensor` that stores scores of each candidate of translations. Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate.
Pack the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation. Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.
## LoD and shape changes during decoding ## LoD and shape changes during decoding
<p align="center"> <p align="center">
<img src="./images/LOD-and-shape-changes-during-decoding.jpg"/> <img src="./images/LOD-and-shape-changes-during-decoding.jpg"/>
</p> </p>
According the image above, the only phrase to change LoD is beam search. According to the image above, the only phase that changes the LoD is beam search.
## Beam search design ## Beam search design
The beam search algorthm will be implemented as one method of the sequence decoder, it has 3 inputs The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs:
1. `topk_ids`, top K candidate ids for each prefix. 1. `topk_ids`, the top K candidate ids for each prefix.
2. `topk_scores`, the corresponding scores for `topk_ids` 2. `topk_scores`, the corresponding scores for `topk_ids`
3. `generated_scores`, the score of the prefixes. 3. `generated_scores`, the score of the prefixes.
All of the are LoDTensors, so that the sequence affilication is clear. All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
It will return three variables It will return three variables:
1. `selected_ids`, the final candidate beam search function selected for the next step. 1. `selected_ids`, the final candidate beam search function selected for the next step.
2. `selected_scores`, the scores for the candidates. 2. `selected_scores`, the scores for the candidates.
3. `generated_scores`, the updated scores for each prefixes (with the new candidates appended). 3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray` ## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors, The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step,
and they exist in each time step,
so it is natural to store them in arrays. so it is natural to store them in arrays.
Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors, Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`.
the results of beam search are better to store in a `TensorArray`.
The `Pack` and `UnPack` in `TensorArray` are used to package tensors in the array to a `LoDTensor` or split the `LoDTensor` to an array of tensors. The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array to an `LoDTensor` or split the `LoDTensor` to an array of tensors.
It needs some extensions to support pack or unpack an array of `LoDTensors`. It needs some extensions to support the packing or unpacking an array of `LoDTensors`.
...@@ -34,7 +34,7 @@ PaddlePaddle的文档构建有两种方式。 ...@@ -34,7 +34,7 @@ PaddlePaddle的文档构建有两种方式。
cd TO_YOUR_PADDLE_CLONE_PATH cd TO_YOUR_PADDLE_CLONE_PATH
mkdir -p build mkdir -p build
cd build cd build
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
make gen_proto_py make gen_proto_py
make paddle_docs paddle_docs_cn make paddle_docs paddle_docs_cn
......
# 构建Android平台上的PaddlePaddle库 # Android平台编译指南
用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库: 用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库:
- 基于Docker容器的编译方式 - 基于Docker容器的编译方式
......
# 构建iOS平台上的PaddlePaddle库 # iOS平台编译指南
交叉编译iOS平台上适用的PaddlePaddle库,需要在MacOS系统上进行。本文的将介绍在MacOS上,从源码交叉编译iOS平台上适用的PaddlePaddle库。 交叉编译iOS平台上适用的PaddlePaddle库,需要在MacOS系统上进行。本文的将介绍在MacOS上,从源码交叉编译iOS平台上适用的PaddlePaddle库。
## 准备交叉编译环境 ## 准备交叉编译环境
...@@ -25,7 +25,7 @@ iOS平台可选配置参数: ...@@ -25,7 +25,7 @@ iOS平台可选配置参数:
- `IOS_PLATFORM`,可设置为`OS/SIMULATOR`,默认值为`OS` - `IOS_PLATFORM`,可设置为`OS/SIMULATOR`,默认值为`OS`
- `OS`,构建目标为`arm`架构的iPhone或者iPad等物理设备。 - `OS`,构建目标为`arm`架构的iPhone或者iPad等物理设备。
- `SIMULATOR`,构建目标为`x86`架构的模拟器平台。 - `SIMULATOR`,构建目标为`x86`架构的模拟器平台。
- `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示: - `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示,默认编译所有架构
<table class="docutils"> <table class="docutils">
<colgroup> <colgroup>
...@@ -41,11 +41,11 @@ iOS平台可选配置参数: ...@@ -41,11 +41,11 @@ iOS平台可选配置参数:
<tbody valign="top"> <tbody valign="top">
<tr class="row-even"> <tr class="row-even">
<td>OS</td> <td>OS</td>
<td>armv7, armv7s, arm64 (默认)</td> <td>armv7, armv7s, arm64 </td>
</tr> </tr>
<tr class="row-odd"> <tr class="row-odd">
<td>SIMULATOR</td> <td>SIMULATOR</td>
<td>i386, x86_64 (默认)</td> <td>i386, x86_64 </td>
</tr> </tr>
</tbody> </tbody>
</table> </table>
...@@ -66,7 +66,7 @@ iOS平台可选配置参数: ...@@ -66,7 +66,7 @@ iOS平台可选配置参数:
```bash ```bash
cmake -DCMAKE_SYSTEM_NAME=iOS \ cmake -DCMAKE_SYSTEM_NAME=iOS \
-DIOS_PLATFORM=OS \ -DIOS_PLATFORM=OS \
-DIOS_ARCH="arm64" \ -DIOS_ARCH="armv7;arm64" \
-DIOS_ENABLE_BITCODE=ON \ -DIOS_ENABLE_BITCODE=ON \
-DIOS_USE_VECLIB_FOR_BLAS=ON \ -DIOS_USE_VECLIB_FOR_BLAS=ON \
-DCMAKE_INSTALL_PREFIX=your/path/to/install \ -DCMAKE_INSTALL_PREFIX=your/path/to/install \
...@@ -112,6 +112,6 @@ $ make install ...@@ -112,6 +112,6 @@ $ make install
- `lib`目录,其中包含PaddlePaddle的C-API静态库 - `lib`目录,其中包含PaddlePaddle的C-API静态库
- `third_party`目录,其中包含所依赖的所有第三方库 - `third_party`目录,其中包含所依赖的所有第三方库
注意,不同架构的PaddlePaddle库建议安装到不同的目录下,然后使用`lipo`工具将多个静态库合并成一个支持多个架构的fat库。 注意,如果PaddlePaddle库需要同时支持真机和模拟器,则需要分别编译真机和模拟器版本,然后使用`lipo`工具合并fat库。
自此,PaddlePaddle库已经安装完成,用户可将合成的fat库用于深度学习相关的iOS App中,调用方法见C-API文档。 自此,PaddlePaddle库已经安装完成,用户可将合成的fat库用于深度学习相关的iOS App中,调用方法见C-API文档。
# 构建Raspberry Pi平台上的PaddlePaddle库 # Raspberry Pi平台编译指南
通常有两个方法来构建基于 Rasspberry Pi 的版本: 通常有两个方法来构建基于 Rasspberry Pi 的版本:
......
...@@ -29,6 +29,9 @@ static void initPaddle(int argc, char** argv) { ...@@ -29,6 +29,9 @@ static void initPaddle(int argc, char** argv) {
extern "C" { extern "C" {
paddle_error paddle_init(int argc, char** argv) { paddle_error paddle_init(int argc, char** argv) {
static bool isInit = false;
if (isInit) return kPD_NO_ERROR;
std::vector<char*> realArgv; std::vector<char*> realArgv;
realArgv.reserve(argc + 1); realArgv.reserve(argc + 1);
realArgv.push_back(strdup("")); realArgv.push_back(strdup(""));
...@@ -37,6 +40,7 @@ paddle_error paddle_init(int argc, char** argv) { ...@@ -37,6 +40,7 @@ paddle_error paddle_init(int argc, char** argv) {
} }
initPaddle(argc + 1, realArgv.data()); initPaddle(argc + 1, realArgv.data());
free(realArgv[0]); free(realArgv[0]);
isInit = true;
return kPD_NO_ERROR; return kPD_NO_ERROR;
} }
} }
...@@ -121,6 +121,7 @@ paddle_error paddle_matrix_get_shape(paddle_matrix mat, ...@@ -121,6 +121,7 @@ paddle_error paddle_matrix_get_shape(paddle_matrix mat,
paddle_matrix paddle_matrix_create_sparse( paddle_matrix paddle_matrix_create_sparse(
uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) { uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
#ifndef PADDLE_MOBILE_INFERENCE
auto ptr = new paddle::capi::CMatrix(); auto ptr = new paddle::capi::CMatrix();
ptr->mat = paddle::Matrix::createSparseMatrix( ptr->mat = paddle::Matrix::createSparseMatrix(
height, height,
...@@ -131,6 +132,9 @@ paddle_matrix paddle_matrix_create_sparse( ...@@ -131,6 +132,9 @@ paddle_matrix paddle_matrix_create_sparse(
false, false,
useGpu); useGpu);
return ptr; return ptr;
#else
return nullptr;
#endif
} }
paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat, paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
...@@ -140,6 +144,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat, ...@@ -140,6 +144,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
uint64_t colSize, uint64_t colSize,
float* valueArray, float* valueArray,
uint64_t valueSize) { uint64_t valueSize) {
#ifndef PADDLE_MOBILE_INFERENCE
if (mat == nullptr) return kPD_NULLPTR; if (mat == nullptr) return kPD_NULLPTR;
auto ptr = cast(mat); auto ptr = cast(mat);
if (rowArray == nullptr || colArray == nullptr || if (rowArray == nullptr || colArray == nullptr ||
...@@ -160,4 +165,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat, ...@@ -160,4 +165,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
} else { } else {
return kPD_NOT_SUPPORTED; return kPD_NOT_SUPPORTED;
} }
#else
return kPD_NOT_SUPPORTED;
#endif
} }
...@@ -48,6 +48,7 @@ PD_API paddle_matrix paddle_matrix_create(uint64_t height, ...@@ -48,6 +48,7 @@ PD_API paddle_matrix paddle_matrix_create(uint64_t height,
* @param isBinary is binary (either 1 or 0 in matrix) or not. * @param isBinary is binary (either 1 or 0 in matrix) or not.
* @param useGpu is using GPU or not. * @param useGpu is using GPU or not.
* @return paddle_matrix. * @return paddle_matrix.
* @note Mobile inference does not support this interface.
*/ */
PD_API paddle_matrix paddle_matrix_create_sparse( PD_API paddle_matrix paddle_matrix_create_sparse(
uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu); uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
...@@ -129,6 +130,7 @@ PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat, ...@@ -129,6 +130,7 @@ PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat,
* NULL if the matrix is binary. * NULL if the matrix is binary.
* @param [in] valueSize length of value array. Zero if the matrix is binary. * @param [in] valueSize length of value array. Zero if the matrix is binary.
* @return paddle_error * @return paddle_error
* @note Mobile inference does not support this interface.
*/ */
PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat, PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
int* rowArray, int* rowArray,
......
...@@ -27,7 +27,9 @@ if(WITH_GPU) ...@@ -27,7 +27,9 @@ if(WITH_GPU)
set_source_files_properties(${CUDA_CXX_SOURCES} set_source_files_properties(${CUDA_CXX_SOURCES}
PROPERTIES COMPILE_FLAGS "-D__NVCC__") PROPERTIES COMPILE_FLAGS "-D__NVCC__")
else() else()
if (NOT MOBILE_INFERENCE)
set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc) set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
endif()
endif() endif()
set(CUDA_CU_SOURCES set(CUDA_CU_SOURCES
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include "hl_base.h" #include "hl_base.h"
/** /**
* @brief Maximum pool forward. * @brief Maximum pool forward with Mask output.
* *
* @param[in] frameCnt batch size of input image. * @param[in] frameCnt batch size of input image.
* @param[in] inputData input data. * @param[in] inputData input data.
...@@ -35,7 +35,7 @@ limitations under the License. */ ...@@ -35,7 +35,7 @@ limitations under the License. */
* @param[in] paddingW padding width. * @param[in] paddingW padding width.
* @param[out] tgtData output data. * @param[out] tgtData output data.
* @param[in] tgtStride stride between output data samples. * @param[in] tgtStride stride between output data samples.
* * @param[out] maskData the location indices of select max data.
*/ */
extern void hl_maxpool_forward(const int frameCnt, extern void hl_maxpool_forward(const int frameCnt,
const real* inputData, const real* inputData,
...@@ -51,7 +51,8 @@ extern void hl_maxpool_forward(const int frameCnt, ...@@ -51,7 +51,8 @@ extern void hl_maxpool_forward(const int frameCnt,
const int paddingH, const int paddingH,
const int paddingW, const int paddingW,
real* tgtData, real* tgtData,
const int tgtStride); const int tgtStride,
real* maskData = NULL);
/** /**
* @brief Maximum pool backward. * @brief Maximum pool backward.
......
...@@ -25,7 +25,9 @@ limitations under the License. */ ...@@ -25,7 +25,9 @@ limitations under the License. */
#include "hl_matrix.h" #include "hl_matrix.h"
#include "hl_sequence.h" #include "hl_sequence.h"
#include "hl_sparse.h" #include "hl_sparse.h"
#ifndef PADDLE_MOBILE_INFERENCE
#include "hl_warpctc_wrap.h" #include "hl_warpctc_wrap.h"
#endif
#ifdef HPPL_STUB_FUNC #ifdef HPPL_STUB_FUNC
#include "stub/hl_aggregate_stub.h" #include "stub/hl_aggregate_stub.h"
......
...@@ -31,7 +31,8 @@ inline void hl_maxpool_forward(const int frameCnt, ...@@ -31,7 +31,8 @@ inline void hl_maxpool_forward(const int frameCnt,
const int paddingH, const int paddingH,
const int paddingW, const int paddingW,
real* tgtData, real* tgtData,
const int tgtStride) {} const int tgtStride,
real* MaskData) {}
inline void hl_maxpool_backward(const int frameCnt, inline void hl_maxpool_backward(const int frameCnt,
const real* inputData, const real* inputData,
......
...@@ -31,7 +31,8 @@ __global__ void KeMaxPoolForward(const int nthreads, ...@@ -31,7 +31,8 @@ __global__ void KeMaxPoolForward(const int nthreads,
const int offsetH, const int offsetH,
const int offsetW, const int offsetW,
real* tgtData, real* tgtData,
const int tgtStride) { const int tgtStride,
real* maskData) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < nthreads) { if (index < nthreads) {
int pw = index % pooledW; int pw = index % pooledW;
...@@ -45,16 +46,22 @@ __global__ void KeMaxPoolForward(const int nthreads, ...@@ -45,16 +46,22 @@ __global__ void KeMaxPoolForward(const int nthreads,
hstart = max(hstart, 0); hstart = max(hstart, 0);
wstart = max(wstart, 0); wstart = max(wstart, 0);
real maxval = -FLT_MAX; real maxval = -FLT_MAX;
int max_index = -1;
inputData += (frameNum * channels + c) * height * width; inputData += (frameNum * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
if (maxval < inputData[h * width + w]) if (maxval < inputData[h * width + w]) {
maxval = inputData[h * width + w]; max_index = h * width + w;
maxval = inputData[max_index];
}
} }
} }
int tgtIndex = int tgtIndex =
index % (pooledW * pooledH * channels) + frameNum * tgtStride; index % (pooledW * pooledH * channels) + frameNum * tgtStride;
tgtData[tgtIndex] = maxval; tgtData[tgtIndex] = maxval;
if (maskData != NULL) {
maskData[tgtIndex] = max_index;
}
} }
} }
...@@ -72,7 +79,8 @@ void hl_maxpool_forward(const int frameCnt, ...@@ -72,7 +79,8 @@ void hl_maxpool_forward(const int frameCnt,
const int paddingH, const int paddingH,
const int paddingW, const int paddingW,
real* tgtData, real* tgtData,
const int tgtStride) { const int tgtStride,
real* maskData) {
int num_kernels = pooledH * pooledW * channels * frameCnt; int num_kernels = pooledH * pooledW * channels * frameCnt;
int blocks = (num_kernels + 1024 - 1) / 1024; int blocks = (num_kernels + 1024 - 1) / 1024;
dim3 threads(1024, 1); dim3 threads(1024, 1);
...@@ -92,7 +100,8 @@ void hl_maxpool_forward(const int frameCnt, ...@@ -92,7 +100,8 @@ void hl_maxpool_forward(const int frameCnt,
paddingH, paddingH,
paddingW, paddingW,
tgtData, tgtData,
tgtStride); tgtStride,
maskData);
CHECK_SYNC("hl_maxpool_forward failed"); CHECK_SYNC("hl_maxpool_forward failed");
} }
......
...@@ -38,9 +38,9 @@ py_proto_compile(framework_py_proto SRCS framework.proto) ...@@ -38,9 +38,9 @@ py_proto_compile(framework_py_proto SRCS framework.proto)
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init) add_dependencies(framework_py_proto framework_py_proto_init)
add_custom_command(TARGET framework_py_proto POST_BUILD add_custom_command(TARGET framework_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto
COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto/ COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto/
COMMENT "Copy generated python proto into directory paddle/v2/framework/proto." COMMENT "Copy generated python proto into directory paddle/v2/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
cc_library(backward SRCS backward.cc DEPS net_op) cc_library(backward SRCS backward.cc DEPS net_op)
......
...@@ -270,6 +270,19 @@ static bool AllGradInSet(const std::vector<std::string>& names, ...@@ -270,6 +270,19 @@ static bool AllGradInSet(const std::vector<std::string>& names,
return false; return false;
} }
} }
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
sout << "All input {";
for (auto& name : names) {
sout << name << ",";
}
sout << "} is in {";
for (auto& name : set) {
sout << name << ",";
}
sout << "}";
VLOG(10) << sout.str();
}
return true; return true;
} }
...@@ -290,14 +303,12 @@ static void CreateGradVarInBlock( ...@@ -290,14 +303,12 @@ static void CreateGradVarInBlock(
auto ops = block_desc->AllOps(); auto ops = block_desc->AllOps();
for (size_t op_index = grad_op_start_index; op_index < ops.size(); for (size_t op_index = grad_op_start_index; op_index < ops.size();
++op_index) { ++op_index) {
bool need_infer_shape = false;
std::unordered_set<std::string> new_vars; std::unordered_set<std::string> new_vars;
ForEachVarName(ops[op_index]->Outputs(), ForEachVarName(ops[op_index]->Outputs(),
[&](const std::string& grad_var_name) { [&](const std::string& grad_var_name) {
if (block_desc->HasVar(grad_var_name)) { if (block_desc->HasVar(grad_var_name)) {
return false; return false;
} }
need_infer_shape = true;
auto var = block_desc->Var(grad_var_name); auto var = block_desc->Var(grad_var_name);
new_vars.insert(var->Name()); new_vars.insert(var->Name());
auto it = param_name_map.find(grad_var_name); auto it = param_name_map.find(grad_var_name);
...@@ -311,23 +322,21 @@ static void CreateGradVarInBlock( ...@@ -311,23 +322,21 @@ static void CreateGradVarInBlock(
grad_record.op_idx_ = static_cast<int>(op_index); grad_record.op_idx_ = static_cast<int>(op_index);
return false; /* not break */ return false; /* not break */
}); });
if (need_infer_shape) { ops[op_index]->InferVarType(block_desc);
ops[op_index]->InferVarType(block_desc); for (auto& arg : ops[op_index]->OutputArgumentNames()) {
for (auto& arg : ops[op_index]->OutputArgumentNames()) { if (new_vars.find(arg) == new_vars.end()) {
if (new_vars.find(arg) == new_vars.end()) { continue;
continue; }
} auto pname = FwdName(arg);
auto pname = FwdName(arg); auto* param = block_desc->FindVarRecursive(pname);
auto* param = block_desc->FindVarRecursive(pname); auto* grad = block_desc->FindVar(arg);
auto* grad = block_desc->FindVar(arg); if (param == nullptr) {
if (param == nullptr) { grad->SetDataType(DataType::FP32);
grad->SetDataType(DataType::FP32); } else {
} else { grad->SetDataType(param->GetDataType());
grad->SetDataType(param->GetDataType());
}
} }
ops[op_index]->InferShape(*block_desc);
} }
ops[op_index]->InferShape(*block_desc);
} }
} }
...@@ -377,10 +386,17 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad( ...@@ -377,10 +386,17 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
return grad_op_descs; return grad_op_descs;
} }
static BlockDescBind* CreateStepBlock(
ProgramDescBind& program_desc,
std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var,
int step_block_idx);
std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
ProgramDescBind& program_desc, int block_idx, ProgramDescBind& program_desc, int block_idx,
std::unordered_set<std::string>* no_grad_vars, std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var) { std::unordered_map<std::string, std::string>* grad_to_var) {
VLOG(5) << "MakeBlockBackward";
BlockDescBind* cur_block = program_desc.MutableBlock(block_idx); BlockDescBind* cur_block = program_desc.MutableBlock(block_idx);
std::vector<OpDescBind*> op_descs = cur_block->AllOps(); std::vector<OpDescBind*> op_descs = cur_block->AllOps();
std::unordered_map<std::string, std::vector<size_t>> dup_out_ops; std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
...@@ -388,22 +404,32 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -388,22 +404,32 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
std::vector<std::unique_ptr<OpDescBind>> backward_descs; std::vector<std::unique_ptr<OpDescBind>> backward_descs;
for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) { for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
VLOG(5) << "Making backward " << (*it)->Type() << " op";
std::vector<std::unique_ptr<OpDescBind>> op_grads; std::vector<std::unique_ptr<OpDescBind>> op_grads;
if ((*it)->Type() == "recurrent") { if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
int step_block_idx = (*it)->GetBlockAttr("step_block"); int step_block_idx = (*it)->GetBlockAttr("step_block");
auto backward_block_op_descs = MakeBlockBackward( BlockDescBind* backward_block = CreateStepBlock(
program_desc, step_block_idx, no_grad_vars, grad_to_var); program_desc, no_grad_vars, grad_to_var, step_block_idx);
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
} else if ((*it)->Type() == "conditional_block") {
BlockDescBind* backward_block = BlockDescBind* backward_block =
program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx)); CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
for (auto& ptr : backward_block_op_descs) { (*it)->GetBlockAttr("block"));
backward_block->AppendAllocatedOp(std::move(ptr));
}
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
} else { } else {
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var); op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
} }
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
sout << "Made ";
for (auto& op_grad : op_grads) {
sout << op_grad->Type() << " ";
}
VLOG(10) << sout.str();
}
for (const auto& desc : op_grads) { for (const auto& desc : op_grads) {
for (const std::string& out_name : desc->OutputArgumentNames()) { for (const std::string& out_name : desc->OutputArgumentNames()) {
if (out_name.find("@GRAD") == std::string::npos) { if (out_name.find("@GRAD") == std::string::npos) {
...@@ -419,6 +445,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -419,6 +445,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs), op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
[](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); }); [](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
} }
VLOG(5) << "Appending Sums";
// Check whether some variables are written more than once // Check whether some variables are written more than once
std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops; std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
for (const auto& dup : dup_out_ops) { for (const auto& dup : dup_out_ops) {
...@@ -426,16 +454,22 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -426,16 +454,22 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
const std::vector<size_t> dup_op = dup.second; const std::vector<size_t> dup_op = dup.second;
if (out_name != kEmptyVarName && dup_op.size() > 1) { if (out_name != kEmptyVarName && dup_op.size() > 1) {
std::vector<std::string> sum_op_inputs; std::vector<std::string> sum_op_inputs;
std::string next_g_name = out_name;
for (size_t i = 0; i < dup_op.size(); ++i) { for (size_t i = 0; i < dup_op.size(); ++i) {
VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name
<< " duplicated";
std::string new_name = out_name + "@RENAME@" + std::to_string(i); std::string new_name = out_name + "@RENAME@" + std::to_string(i);
backward_descs[dup_op[i]]->Rename(out_name, new_name); backward_descs[dup_op[i]]->RenameOutput(out_name, new_name);
backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name);
sum_op_inputs.emplace_back(new_name); sum_op_inputs.emplace_back(new_name);
next_g_name = sum_op_inputs.back();
} }
std::unique_ptr<OpDescBind> sum_op(new OpDescBind( std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
"sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {})); "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
} }
} }
pending_sum_ops.sort( pending_sum_ops.sort(
[](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a, [](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) { const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
...@@ -446,9 +480,26 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -446,9 +480,26 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
std::move(p.second)); std::move(p.second));
} }
VLOG(5) << "MakeBlockBackward Finished";
return backward_descs; return backward_descs;
} }
static BlockDescBind* CreateStepBlock(
ProgramDescBind& program_desc,
std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var,
int step_block_idx) {
auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
no_grad_vars, grad_to_var);
BlockDescBind* backward_block =
program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
for (auto& ptr : backward_block_op_descs) {
backward_block->AppendAllocatedOp(move(ptr));
}
return backward_block;
}
ParamGradInfoMap AppendBackward( ParamGradInfoMap AppendBackward(
ProgramDescBind& program_desc, const VarDescBind& target, ProgramDescBind& program_desc, const VarDescBind& target,
const std::unordered_set<std::string>& no_grad_vars) { const std::unordered_set<std::string>& no_grad_vars) {
...@@ -462,19 +513,14 @@ ParamGradInfoMap AppendBackward( ...@@ -462,19 +513,14 @@ ParamGradInfoMap AppendBackward(
const int root_block_idx = 0; const int root_block_idx = 0;
auto root_block = program_desc.MutableBlock(root_block_idx); auto root_block = program_desc.MutableBlock(root_block_idx);
// insert fill one op for target
// TODO(qiao) add some check to the target.
std::string fill_one_op_out = GradVarName(target.Name()); std::string fill_one_op_out = GradVarName(target.Name());
std::vector<int64_t> target_shape_desc = target.Shape(); bool is_scalar = target.Shape() == std::vector<int64_t>{1};
std::vector<int> target_shape; PADDLE_ENFORCE(is_scalar, "target should be scalar");
std::transform(target_shape_desc.begin(), target_shape_desc.end(),
std::back_inserter(target_shape),
[](int64_t dim) { return static_cast<int>(dim); });
VLOG(3) << "backward from loss=" << target.Name() VLOG(3) << "backward from loss=" << target.Name()
<< " data_type=" << target.GetDataType(); << " data_type=" << target.GetDataType();
std::unique_ptr<OpDescBind> fill_one_op( std::unique_ptr<OpDescBind> fill_one_op(
new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}}, new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
{{"shape", target_shape}, {{"shape", std::vector<int>{1}},
{"value", static_cast<float>(1.0)}, {"value", static_cast<float>(1.0)},
{"data_type", target.GetDataType()}})); {"data_type", target.GetDataType()}}));
// infer var type of fill_one_op // infer var type of fill_one_op
......
...@@ -508,6 +508,7 @@ TEST(Backward, simple_single_op) { ...@@ -508,6 +508,7 @@ TEST(Backward, simple_single_op) {
op->SetOutput("Out", {"out"}); op->SetOutput("Out", {"out"});
auto target = f::VarDescBind("out"); auto target = f::VarDescBind("out");
target.SetShape({1});
auto var_to_grad = AppendBackward(program, target, {}); auto var_to_grad = AppendBackward(program, target, {});
ASSERT_EQ(block->AllOps().size(), 3UL); ASSERT_EQ(block->AllOps().size(), 3UL);
...@@ -544,6 +545,7 @@ TEST(Backward, default_attribute) { ...@@ -544,6 +545,7 @@ TEST(Backward, default_attribute) {
op->CheckAttrs(); op->CheckAttrs();
auto target = f::VarDescBind("out"); auto target = f::VarDescBind("out");
target.SetShape({1});
AppendBackward(program, target, {}); AppendBackward(program, target, {});
ASSERT_EQ(block->AllOps().size(), 3UL); ASSERT_EQ(block->AllOps().size(), 3UL);
...@@ -581,6 +583,7 @@ TEST(Backward, simple_mult_op) { ...@@ -581,6 +583,7 @@ TEST(Backward, simple_mult_op) {
op3->SetOutput("Out", {"out3"}); op3->SetOutput("Out", {"out3"});
auto target = f::VarDescBind("out3"); auto target = f::VarDescBind("out3");
target.SetShape({1});
size_t forward_len = block->AllOps().size(); size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {}); auto var_to_grad = AppendBackward(program, target, {});
...@@ -670,6 +673,7 @@ TEST(Backward, intermedia_var_no_grad) { ...@@ -670,6 +673,7 @@ TEST(Backward, intermedia_var_no_grad) {
op4->SetOutput("Out", {"out4"}); op4->SetOutput("Out", {"out4"});
auto target = f::VarDescBind("out4"); auto target = f::VarDescBind("out4");
target.SetShape({1});
size_t forward_len = block->AllOps().size(); size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {"out3"}); auto var_to_grad = AppendBackward(program, target, {"out3"});
...@@ -730,6 +734,7 @@ TEST(Backward, var_no_grad) { ...@@ -730,6 +734,7 @@ TEST(Backward, var_no_grad) {
op2->SetOutput("Z", {"z2"}); op2->SetOutput("Z", {"z2"});
auto target = f::VarDescBind("z2"); auto target = f::VarDescBind("z2");
target.SetShape({1});
size_t forward_len = block->AllOps().size(); size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {"z1"}); auto var_to_grad = AppendBackward(program, target, {"z1"});
...@@ -810,6 +815,7 @@ TEST(Backward, shared_var) { ...@@ -810,6 +815,7 @@ TEST(Backward, shared_var) {
op3->SetOutput("Out", {"out3"}); op3->SetOutput("Out", {"out3"});
auto target = f::VarDescBind("out3"); auto target = f::VarDescBind("out3");
target.SetShape({1});
size_t forward_len = block->AllOps().size(); size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {}); auto var_to_grad = AppendBackward(program, target, {});
...@@ -888,6 +894,7 @@ TEST(Backward, half_backward) { ...@@ -888,6 +894,7 @@ TEST(Backward, half_backward) {
op1->SetOutput("Out", {"out"}); op1->SetOutput("Out", {"out"});
auto target = f::VarDescBind("out"); auto target = f::VarDescBind("out");
target.SetShape({1});
size_t forward_len = block->AllOps().size(); size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {"b"}); auto var_to_grad = AppendBackward(program, target, {"b"});
f::OpDescBind *fill_op = block->AllOps()[forward_len]; f::OpDescBind *fill_op = block->AllOps()[forward_len];
......
...@@ -29,6 +29,8 @@ inline DataType ToDataType(std::type_index type) { ...@@ -29,6 +29,8 @@ inline DataType ToDataType(std::type_index type) {
return DataType::INT32; return DataType::INT32;
} else if (typeid(int64_t).hash_code() == type.hash_code()) { } else if (typeid(int64_t).hash_code() == type.hash_code()) {
return DataType::INT64; return DataType::INT64;
} else if (typeid(bool).hash_code() == type.hash_code()) {
return DataType::BOOL;
} else { } else {
PADDLE_THROW("Not supported"); PADDLE_THROW("Not supported");
} }
...@@ -44,6 +46,8 @@ inline std::type_index ToTypeIndex(DataType type) { ...@@ -44,6 +46,8 @@ inline std::type_index ToTypeIndex(DataType type) {
return typeid(int); return typeid(int);
case DataType::INT64: case DataType::INT64:
return typeid(int64_t); return typeid(int64_t);
case DataType::BOOL:
return typeid(bool);
default: default:
PADDLE_THROW("Not support type %d", type); PADDLE_THROW("Not support type %d", type);
} }
...@@ -64,6 +68,9 @@ inline void VisitDataType(DataType type, Visitor visitor) { ...@@ -64,6 +68,9 @@ inline void VisitDataType(DataType type, Visitor visitor) {
case DataType::INT64: case DataType::INT64:
visitor.template operator()<int64_t>(); visitor.template operator()<int64_t>();
break; break;
case DataType::BOOL:
visitor.template operator()<bool>();
break;
default: default:
PADDLE_THROW("Not supported"); PADDLE_THROW("Not supported");
} }
......
...@@ -60,8 +60,7 @@ void make_ddim(DDim& ddim, const int64_t* dims, int n) { ...@@ -60,8 +60,7 @@ void make_ddim(DDim& ddim, const int64_t* dims, int n) {
ddim = make_dim<9>(dims); ddim = make_dim<9>(dims);
break; break;
default: default:
throw std::invalid_argument( PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions.");
"Dynamic dimensions must have between [1, 9] dimensions.");
} }
} }
......
...@@ -120,6 +120,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id, ...@@ -120,6 +120,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) { for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(10) << op->DebugString();
op->Run(*local_scope, *device); op->Run(*local_scope, *device);
} }
if (create_local_scope) { if (create_local_scope) {
......
...@@ -235,6 +235,23 @@ void OpDescBind::Rename(const std::string &old_name, ...@@ -235,6 +235,23 @@ void OpDescBind::Rename(const std::string &old_name,
need_update_ = true; need_update_ = true;
} }
void OpDescBind::RenameOutput(const std::string &old_name,
const std::string &new_name) {
for (auto &output : outputs_) {
std::replace(output.second.begin(), output.second.end(), old_name,
new_name);
}
need_update_ = true;
}
void OpDescBind::RenameInput(const std::string &old_name,
const std::string &new_name) {
for (auto &input : inputs_) {
std::replace(input.second.begin(), input.second.end(), old_name, new_name);
}
need_update_ = true;
}
struct SetAttrDescVisitor : public boost::static_visitor<void> { struct SetAttrDescVisitor : public boost::static_visitor<void> {
explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {} explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
mutable OpDesc::Attr *attr_; mutable OpDesc::Attr *attr_;
...@@ -448,7 +465,12 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs( ...@@ -448,7 +465,12 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const { DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
auto var = block_.FindVarRecursive(name); auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
return framework::make_ddim(var->Shape()); try {
return framework::make_ddim(var->Shape());
} catch (...) {
VLOG(5) << "GetDim of variable " << name << " error";
std::rethrow_exception(std::current_exception());
}
} }
void CompileTimeInferShapeContext::SetDim(const std::string &name, void CompileTimeInferShapeContext::SetDim(const std::string &name,
......
...@@ -73,6 +73,10 @@ class OpDescBind { ...@@ -73,6 +73,10 @@ class OpDescBind {
void Rename(const std::string &old_name, const std::string &new_name); void Rename(const std::string &old_name, const std::string &new_name);
void RenameOutput(const std::string &old_name, const std::string &new_name);
void RenameInput(const std::string &old_name, const std::string &new_name);
// Only be used in C++ // Only be used in C++
const AttributeMap &GetAttrMap() const; const AttributeMap &GetAttrMap() const;
......
...@@ -403,19 +403,6 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -403,19 +403,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
void OperatorWithKernel::Run(const Scope& scope, void OperatorWithKernel::Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const { const platform::DeviceContext& dev_ctx) const {
if (VLOG_IS_ON(1)) {
auto inputs = this->InputVars();
auto outputs = this->OutputVars(true);
std::ostringstream sout;
sout << "Run operator " << this->Type() << " From [";
std::ostream_iterator<std::string> out_it(sout, ",");
std::copy(inputs.begin(), inputs.end(), out_it);
sout << "] to [";
std::copy(outputs.begin(), outputs.end(), out_it);
sout << "]";
VLOG(1) << sout.str();
}
RuntimeInferShapeContext infer_shape_ctx(*this, scope); RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx); this->InferShape(&infer_shape_ctx);
......
...@@ -38,11 +38,12 @@ Scope& Scope::NewScope() const { ...@@ -38,11 +38,12 @@ Scope& Scope::NewScope() const {
Variable* Scope::Var(const std::string& name) { Variable* Scope::Var(const std::string& name) {
auto iter = vars_.find(name); auto iter = vars_.find(name);
if (iter != vars_.end()) { if (iter != vars_.end()) {
VLOG(3) << "Get existing variable " << name;
return iter->second; return iter->second;
} }
Variable* v = new Variable(); Variable* v = new Variable();
vars_[name] = v; vars_[name] = v;
VLOG(3) << "Create variable " << name << " on scope"; VLOG(3) << "Create variable " << name;
v->name_ = &(vars_.find(name)->first); v->name_ = &(vars_.find(name)->first);
return v; return v;
} }
......
...@@ -53,6 +53,10 @@ class InferShapeContext { ...@@ -53,6 +53,10 @@ class InferShapeContext {
virtual bool IsRuntime() const = 0; virtual bool IsRuntime() const = 0;
// Note: In while op, we need this to be public
void SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims);
protected: protected:
virtual framework::DDim GetDim(const std::string &name) const = 0; virtual framework::DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0; virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
...@@ -60,9 +64,6 @@ class InferShapeContext { ...@@ -60,9 +64,6 @@ class InferShapeContext {
std::vector<framework::DDim> GetDims( std::vector<framework::DDim> GetDims(
const std::vector<std::string> &names) const; const std::vector<std::string> &names) const;
void SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims);
std::vector<VarDesc::VarType> GetVarTypes( std::vector<VarDesc::VarType> GetVarTypes(
const std::vector<std::string> &names) const; const std::vector<std::string> &names) const;
......
...@@ -61,6 +61,7 @@ public: ...@@ -61,6 +61,7 @@ public:
// function arguments // function arguments
strides_ = config.get<std::vector<size_t>>("strides"); strides_ = config.get<std::vector<size_t>>("strides");
paddings_ = config.get<std::vector<size_t>>("paddings"); paddings_ = config.get<std::vector<size_t>>("paddings");
dilations_ = config.get<std::vector<size_t>>("dilations");
groups_ = config.get<size_t>("groups"); groups_ = config.get<size_t>("groups");
// number of inputs and outputs // number of inputs and outputs
...@@ -118,6 +119,7 @@ protected: ...@@ -118,6 +119,7 @@ protected:
std::vector<size_t> strides_; std::vector<size_t> strides_;
std::vector<size_t> paddings_; std::vector<size_t> paddings_;
std::vector<size_t> dilations_;
/// Group size, refer to grouped convolution in /// Group size, refer to grouped convolution in
/// Alex Krizhevsky's paper: when group=2, the first half of the /// Alex Krizhevsky's paper: when group=2, the first half of the
...@@ -133,6 +135,10 @@ protected: ...@@ -133,6 +135,10 @@ protected:
inline int paddingW() const { return paddings_[1]; } inline int paddingW() const { return paddings_[1]; }
inline int dilationH() const { return dilations_[0]; }
inline int dilationW() const { return dilations_[1]; }
// A temporary memory in convolution calculation. // A temporary memory in convolution calculation.
MemoryHandlePtr memory_; MemoryHandlePtr memory_;
......
...@@ -79,45 +79,59 @@ void Convolution(const std::string& conv1, ...@@ -79,45 +79,59 @@ void Convolution(const std::string& conv1,
if (outputChannels < inputChannels) continue; if (outputChannels < inputChannels) continue;
for (size_t stride : {1, 2}) { for (size_t stride : {1, 2}) {
for (size_t padding : {0, 1}) { for (size_t padding : {0, 1}) {
if (padding >= filterSize) break; for (size_t dilation : {1, 3}) {
if (padding >= filterSize) break;
size_t filterS = (filterSize - 1) * dilation + 1;
// NNPACK only supports stride = 1 if batchSize > 1 if (inputSize + 2 * padding < filterS) break;
if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") &&
batchSize > 1 && stride > 1)
break;
size_t outputSize = if ((conv1 == "NaiveConv-CPU" || conv2 == "NaiveConv-CPU" ||
(inputSize - filterSize + 2 * padding + stride) / stride; conv1 == "NNPACKConv-CPU" ||
VLOG(3) << " batchSize=" << batchSize conv2 == "NNPACKConv-CPU") &&
<< " inputChannels=" << inputChannels dilation > 1)
<< " inputHeight=" << inputSize break;
<< " inputWidth=" << inputSize
<< " outputChannels=" << outputChannels
<< " filterHeight=" << filterSize
<< " filterWidth=" << filterSize
<< " outputHeight=" << outputSize
<< " outputWidth=" << outputSize << " stride=" << stride
<< " padding=" << padding;
std::vector<size_t> paddings = {padding, padding}; // NNPACK only supports stride = 1 if batchSize > 1
std::vector<size_t> strides = {stride, stride}; if ((conv1 == "NNPACKConv-CPU" ||
Compare2Function<DType1, DType2> test( conv2 == "NNPACKConv-CPU") &&
conv1, batchSize > 1 && stride > 1)
conv2, break;
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)1)
.set("algo", (std::string) "auto"));
TensorShape input{ size_t outputSize =
batchSize, inputChannels, inputSize, inputSize}; (inputSize - filterS + 2 * padding + stride) / stride;
TensorShape filter{ VLOG(3) << " batchSize=" << batchSize
outputChannels, inputChannels, filterSize, filterSize}; << " inputChannels=" << inputChannels
TensorShape output{ << " inputHeight=" << inputSize
batchSize, outputChannels, outputSize, outputSize}; << " inputWidth=" << inputSize
<< " outputChannels=" << outputChannels
<< " filterHeight=" << filterSize
<< " filterWidth=" << filterSize
<< " outputHeight=" << outputSize
<< " outputWidth=" << outputSize
<< " stride=" << stride << " padding=" << padding;
function(test, input, filter, output); std::vector<size_t> paddings = {padding, padding};
std::vector<size_t> strides = {stride, stride};
std::vector<size_t> dilations = {dilation, dilation};
Compare2Function<DType1, DType2> test(
conv1,
conv2,
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("dilations", dilations)
.set("groups", (size_t)1)
.set("algo", (std::string) "auto"));
TensorShape input{
batchSize, inputChannels, inputSize, inputSize};
TensorShape filter{
outputChannels, inputChannels, filterSize, filterSize};
TensorShape output{
batchSize, outputChannels, outputSize, outputSize};
function(test, input, filter, output);
}
} }
} }
} }
...@@ -144,6 +158,7 @@ void Convolution2(const std::string& conv1, ...@@ -144,6 +158,7 @@ void Convolution2(const std::string& conv1,
for (size_t outputChannels : {7}) { for (size_t outputChannels : {7}) {
size_t stride = 1; size_t stride = 1;
size_t padding = 0; size_t padding = 0;
size_t dilation = 1;
size_t outputHeight = size_t outputHeight =
(inputHeight - filterHeight + 2 * padding + stride) / (inputHeight - filterHeight + 2 * padding + stride) /
stride; stride;
...@@ -162,6 +177,7 @@ void Convolution2(const std::string& conv1, ...@@ -162,6 +177,7 @@ void Convolution2(const std::string& conv1,
std::vector<size_t> paddings = {padding, padding}; std::vector<size_t> paddings = {padding, padding};
std::vector<size_t> strides = {stride, stride}; std::vector<size_t> strides = {stride, stride};
std::vector<size_t> dilations = {dilation, dilation};
Compare2Function<DType1, DType2> test( Compare2Function<DType1, DType2> test(
conv1, conv1,
conv2, conv2,
...@@ -169,6 +185,7 @@ void Convolution2(const std::string& conv1, ...@@ -169,6 +185,7 @@ void Convolution2(const std::string& conv1,
.set("paddings", paddings) .set("paddings", paddings)
.set("strides", strides) .set("strides", strides)
.set("groups", (size_t)1) .set("groups", (size_t)1)
.set("dilations", dilations)
.set("algo", (std::string) "auto")); .set("algo", (std::string) "auto"));
TensorShape input{ TensorShape input{
...@@ -223,6 +240,7 @@ void DepthwiseConvolution(const std::string& conv1, ...@@ -223,6 +240,7 @@ void DepthwiseConvolution(const std::string& conv1,
std::vector<size_t> paddings = {padding, padding}; std::vector<size_t> paddings = {padding, padding};
std::vector<size_t> strides = {stride, stride}; std::vector<size_t> strides = {stride, stride};
std::vector<size_t> dilations = {1, 1};
size_t groups = inputChannels; size_t groups = inputChannels;
Compare2Function<DType1, DType2> test( Compare2Function<DType1, DType2> test(
conv1, conv1,
...@@ -231,6 +249,7 @@ void DepthwiseConvolution(const std::string& conv1, ...@@ -231,6 +249,7 @@ void DepthwiseConvolution(const std::string& conv1,
.set("paddings", paddings) .set("paddings", paddings)
.set("strides", strides) .set("strides", strides)
.set("groups", groups) .set("groups", groups)
.set("dilations", dilations)
.set("algo", (std::string) "auto")); .set("algo", (std::string) "auto"));
TensorShape input{ TensorShape input{
......
...@@ -100,7 +100,9 @@ public: ...@@ -100,7 +100,9 @@ public:
strideH(), strideH(),
strideW(), strideW(),
paddingH(), paddingH(),
paddingW()); paddingW(),
dilationH(),
dilationW());
} else { } else {
colData = inputData + g * inputOffset; colData = inputData + g * inputOffset;
} }
...@@ -223,7 +225,9 @@ public: ...@@ -223,7 +225,9 @@ public:
strideH(), strideH(),
strideW(), strideW(),
paddingH(), paddingH(),
paddingW()); paddingW(),
dilationH(),
dilationW());
} }
} }
inputGrad += inputChannels * inputHeight * inputWidth; inputGrad += inputChannels * inputHeight * inputWidth;
...@@ -310,7 +314,9 @@ public: ...@@ -310,7 +314,9 @@ public:
strideH(), strideH(),
strideW(), strideW(),
paddingH(), paddingH(),
paddingW()); paddingW(),
dilationH(),
dilationW());
} else { } else {
colData = inputData + g * inputOffset; colData = inputData + g * inputOffset;
} }
......
...@@ -78,7 +78,9 @@ public: ...@@ -78,7 +78,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth); int paddingWidth,
int dilationHeight = 1,
int dilationWidth = 1);
}; };
template <ColFormat Format, DeviceType Device, class T> template <ColFormat Format, DeviceType Device, class T>
...@@ -91,7 +93,9 @@ public: ...@@ -91,7 +93,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth); int paddingWidth,
int dilationHeight = 1,
int dilationWidth = 1);
}; };
} // namespace paddle } // namespace paddle
...@@ -31,7 +31,9 @@ public: ...@@ -31,7 +31,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -47,8 +49,8 @@ public: ...@@ -47,8 +49,8 @@ public:
int c_im = c / filterWidth / filterHeight; int c_im = c / filterWidth / filterHeight;
for (int h = 0; h < outputHeight; ++h) { for (int h = 0; h < outputHeight; ++h) {
for (int w = 0; w < outputWidth; ++w) { for (int w = 0; w < outputWidth; ++w) {
int imRowIdx = h * strideHeight + hOffset; int imRowIdx = h * strideHeight + hOffset * dilationHeight;
int imColIdx = w * strideWidth + wOffset; int imColIdx = w * strideWidth + wOffset * dilationWidth;
if ((imRowIdx - paddingHeight) < 0 || if ((imRowIdx - paddingHeight) < 0 ||
(imRowIdx - paddingHeight) >= inputHeight || (imRowIdx - paddingHeight) >= inputHeight ||
(imColIdx - paddingWidth) < 0 || (imColIdx - paddingWidth) < 0 ||
...@@ -81,7 +83,9 @@ public: ...@@ -81,7 +83,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -97,8 +101,8 @@ public: ...@@ -97,8 +101,8 @@ public:
int c_im = c / filterWidth / filterHeight; int c_im = c / filterWidth / filterHeight;
for (int h = 0; h < outputHeight; ++h) { for (int h = 0; h < outputHeight; ++h) {
for (int w = 0; w < outputWidth; ++w) { for (int w = 0; w < outputWidth; ++w) {
int imRowIdx = h * strideHeight + hOffset; int imRowIdx = h * strideHeight + hOffset * dilationHeight;
int imColIdx = w * strideWidth + wOffset; int imColIdx = w * strideWidth + wOffset * dilationWidth;
if ((imRowIdx - paddingHeight) >= 0 && if ((imRowIdx - paddingHeight) >= 0 &&
(imRowIdx - paddingHeight) < inputHeight && (imRowIdx - paddingHeight) < inputHeight &&
(imColIdx - paddingWidth) >= 0 && (imColIdx - paddingWidth) >= 0 &&
...@@ -134,7 +138,9 @@ public: ...@@ -134,7 +138,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight = 1,
int dilationWidth = 1) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -147,9 +153,10 @@ public: ...@@ -147,9 +153,10 @@ public:
for (int channel = 0; channel < inputChannels; ++channel) { for (int channel = 0; channel < inputChannels; ++channel) {
for (int filterH = 0; filterH < filterHeight; ++filterH) { for (int filterH = 0; filterH < filterHeight; ++filterH) {
for (int filterW = 0; filterW < filterWidth; ++filterW) { for (int filterW = 0; filterW < filterWidth; ++filterW) {
int imRowOffset = int imRowOffset = outputH * strideHeight +
outputH * strideHeight + filterH - paddingHeight; filterH * dilationHeight - paddingHeight;
int imColOffset = outputW * strideWidth + filterW - paddingWidth; int imColOffset = outputW * strideWidth +
filterW * dilationWidth - paddingWidth;
int colDataOffset = int colDataOffset =
(((outputH * outputWidth + outputW) * inputChannels + (((outputH * outputWidth + outputW) * inputChannels +
channel) * channel) *
...@@ -189,7 +196,9 @@ public: ...@@ -189,7 +196,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight = 1,
int dilationWidth = 1) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -202,9 +211,10 @@ public: ...@@ -202,9 +211,10 @@ public:
for (int channel = 0; channel < inputChannels; ++channel) { for (int channel = 0; channel < inputChannels; ++channel) {
for (int filterH = 0; filterH < filterHeight; ++filterH) { for (int filterH = 0; filterH < filterHeight; ++filterH) {
for (int filterW = 0; filterW < filterWidth; ++filterW) { for (int filterW = 0; filterW < filterWidth; ++filterW) {
int imRowOffset = int imRowOffset = outputH * strideHeight +
outputH * strideHeight + filterH - paddingHeight; filterH * dilationHeight - paddingHeight;
int imColOffset = outputW * strideWidth + filterW - paddingWidth; int imColOffset = outputW * strideWidth +
filterW * dilationWidth - paddingWidth;
int colDataOffset = int colDataOffset =
(((outputH * outputWidth + outputW) * inputChannels + (((outputH * outputWidth + outputW) * inputChannels +
channel) * channel) *
......
...@@ -28,6 +28,8 @@ __global__ void im2col(const T* data_im, ...@@ -28,6 +28,8 @@ __global__ void im2col(const T* data_im,
int strideW, int strideW,
int paddingH, int paddingH,
int paddingW, int paddingW,
int dilationH,
int dilationW,
int height_col, int height_col,
int width_col, int width_col,
T* data_col) { T* data_col) {
...@@ -44,8 +46,8 @@ __global__ void im2col(const T* data_im, ...@@ -44,8 +46,8 @@ __global__ void im2col(const T* data_im,
data_col += (channel_out * height_col + h_out) * width_col + w_out; data_col += (channel_out * height_col + h_out) * width_col + w_out;
for (int i = 0; i < blockH; ++i) { for (int i = 0; i < blockH; ++i) {
for (int j = 0; j < blockW; ++j) { for (int j = 0; j < blockW; ++j) {
int rIdx = int(h_in + i); int rIdx = int(h_in + i * dilationH);
int cIdx = int(w_in + j); int cIdx = int(w_in + j * dilationW);
if ((rIdx - (int)paddingH) >= (int)height || if ((rIdx - (int)paddingH) >= (int)height ||
(rIdx - (int)paddingH) < 0 || (rIdx - (int)paddingH) < 0 ||
(cIdx - (int)paddingW) >= (int)width || (cIdx - (int)paddingW) >= (int)width ||
...@@ -77,7 +79,9 @@ public: ...@@ -77,7 +79,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -102,6 +106,8 @@ public: ...@@ -102,6 +106,8 @@ public:
strideWidth, strideWidth,
paddingHeight, paddingHeight,
paddingWidth, paddingWidth,
dilationHeight,
dilationWidth,
outputHeight, outputHeight,
outputWidth, outputWidth,
colData); colData);
...@@ -121,6 +127,8 @@ __global__ void col2im(size_t n, ...@@ -121,6 +127,8 @@ __global__ void col2im(size_t n,
size_t strideW, size_t strideW,
size_t paddingH, size_t paddingH,
size_t paddingW, size_t paddingW,
size_t dilationH,
size_t dilationW,
size_t height_col, size_t height_col,
size_t width_col, size_t width_col,
T* data_im) { T* data_im) {
...@@ -131,23 +139,34 @@ __global__ void col2im(size_t n, ...@@ -131,23 +139,34 @@ __global__ void col2im(size_t n,
int w = int(index % width); int w = int(index % width);
int h = int((index / width) % height); int h = int((index / width) % height);
int c = int(index / (width * height)); int c = int(index / (width * height));
int filterH = (blockH - 1) * dilationH + 1;
int filterW = (blockW - 1) * dilationW + 1;
if ((w - (int)paddingW) >= 0 && if ((w - (int)paddingW) >= 0 &&
(w - (int)paddingW) < (width - 2 * paddingW) && (w - (int)paddingW) < (width - 2 * paddingW) &&
(h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) { (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) {
// compute the start and end of the output // compute the start and end of the output
int w_col_start = int w_col_start =
(w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; (w < (int)filterW) ? 0 : (w - int(filterW)) / (int)strideW + 1;
int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col)); int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
int h_col_start = int h_col_start =
(h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1; (h < (int)filterH) ? 0 : (h - (int)filterH) / (int)strideH + 1;
int h_col_end = min(int(h / strideH + 1), int(height_col)); int h_col_end = min(int(h / strideH + 1), int(height_col));
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
// the col location: [c * width * height + h_out, w_out] // the col location: [c * width * height + h_out, w_out]
int c_col = int(c * blockH * blockW) + int h_k = (h - h_col * strideH);
(h - h_col * (int)strideH) * (int)blockW + int w_k = (w - w_col * strideW);
(w - w_col * (int)strideW); if (h_k % dilationH == 0 && w_k % dilationW == 0) {
val += data_col[(c_col * height_col + h_col) * width_col + w_col]; h_k /= dilationH;
w_k /= dilationW;
int c_col =
(((c * blockH + h_k) * blockW + w_k) * height_col + h_col) *
width_col +
w_col;
val += data_col[c_col];
}
} }
} }
h -= paddingH; h -= paddingH;
...@@ -173,7 +192,9 @@ public: ...@@ -173,7 +192,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -205,6 +226,8 @@ public: ...@@ -205,6 +226,8 @@ public:
strideWidth, strideWidth,
paddingHeight, paddingHeight,
paddingWidth, paddingWidth,
dilationHeight,
dilationWidth,
outputHeight, outputHeight,
outputWidth, outputWidth,
imData); imData);
...@@ -229,6 +252,8 @@ __global__ void im2colOCF(const T* imData, ...@@ -229,6 +252,8 @@ __global__ void im2colOCF(const T* imData,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth, int paddingWidth,
int dilationHeight,
int dilationWidth,
int outputHeight, int outputHeight,
int outputWidth) { int outputWidth) {
int swId = blockIdx.x; int swId = blockIdx.x;
...@@ -237,8 +262,10 @@ __global__ void im2colOCF(const T* imData, ...@@ -237,8 +262,10 @@ __global__ void im2colOCF(const T* imData,
channelId += blockDim.z) { channelId += blockDim.z) {
for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
int widthOffset = idx + swId * strideWidth - paddingWidth; int widthOffset =
int heightOffset = idy + shId * strideHeight - paddingHeight; idx * dilationHeight + swId * strideWidth - paddingWidth;
int heightOffset =
idy * dilationWidth + shId * strideHeight - paddingHeight;
int imOffset = widthOffset + heightOffset * inputWidth + int imOffset = widthOffset + heightOffset * inputWidth +
channelId * inputHeight * inputWidth; channelId * inputHeight * inputWidth;
...@@ -273,7 +300,9 @@ public: ...@@ -273,7 +300,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -312,6 +341,8 @@ public: ...@@ -312,6 +341,8 @@ public:
strideWidth, strideWidth,
paddingHeight, paddingHeight,
paddingWidth, paddingWidth,
dilationHeight,
dilationWidth,
outputHeight, outputHeight,
outputWidth); outputWidth);
CHECK_SYNC("Im2ColFunctor GPU failed"); CHECK_SYNC("Im2ColFunctor GPU failed");
...@@ -330,6 +361,8 @@ __global__ void col2imOCF(T* imData, ...@@ -330,6 +361,8 @@ __global__ void col2imOCF(T* imData,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth, int paddingWidth,
int dilationHeight,
int dilationWidth,
int outputHeight, int outputHeight,
int outputWidth) { int outputWidth) {
int swId = blockIdx.x; int swId = blockIdx.x;
...@@ -338,8 +371,10 @@ __global__ void col2imOCF(T* imData, ...@@ -338,8 +371,10 @@ __global__ void col2imOCF(T* imData,
channelId += blockDim.z) { channelId += blockDim.z) {
for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
int widthOffset = idx + swId * strideWidth - paddingWidth; int widthOffset =
int heightOffset = idy + shId * strideHeight - paddingHeight; idx * dilationWidth + swId * strideWidth - paddingWidth;
int heightOffset =
idy * dilationHeight + shId * strideHeight - paddingHeight;
int imOffset = widthOffset + heightOffset * inputWidth + int imOffset = widthOffset + heightOffset * inputWidth +
channelId * inputHeight * inputWidth; channelId * inputHeight * inputWidth;
...@@ -372,7 +407,9 @@ public: ...@@ -372,7 +407,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -411,6 +448,8 @@ public: ...@@ -411,6 +448,8 @@ public:
strideWidth, strideWidth,
paddingHeight, paddingHeight,
paddingWidth, paddingWidth,
dilationHeight,
dilationWidth,
outputHeight, outputHeight,
outputWidth); outputWidth);
CHECK_SYNC("Col2ImFunctor GPU failed"); CHECK_SYNC("Col2ImFunctor GPU failed");
......
...@@ -29,82 +29,98 @@ void TestIm2ColFunctor() { ...@@ -29,82 +29,98 @@ void TestIm2ColFunctor() {
for (size_t filterWidth : {3, 7}) { for (size_t filterWidth : {3, 7}) {
for (size_t stride : {1, 2}) { for (size_t stride : {1, 2}) {
for (size_t padding : {0, 1}) { for (size_t padding : {0, 1}) {
if (inputHeight <= filterHeight || inputWidth <= filterWidth) for (size_t dilation : {1, 3}) {
break; size_t filterSizeH = (filterHeight - 1) * dilation + 1;
if (padding >= filterHeight || padding >= filterWidth) break; size_t filterSizeW = (filterWidth - 1) * dilation + 1;
size_t outputHeight = if (inputHeight + 2 * padding < filterSizeH ||
(inputHeight - filterHeight + 2 * padding + stride) / inputWidth + 2 * padding < filterSizeW)
stride; break;
size_t outputWidth = if (padding >= filterSizeH || padding >= filterSizeW) break;
(inputWidth - filterWidth + 2 * padding + stride) / stride; size_t outputHeight =
(inputHeight - filterSizeH + 2 * padding) / stride + 1;
TensorShape imShape = size_t outputWidth =
TensorShape({channels, inputHeight, inputWidth}); (inputWidth - filterSizeW + 2 * padding) / stride + 1;
TensorShape colShape1 = TensorShape({channels,
filterHeight, TensorShape imShape =
filterWidth, TensorShape({channels, inputHeight, inputWidth});
outputHeight, TensorShape colShape1 = TensorShape({channels,
outputWidth}); filterHeight,
TensorShape colShape2 = TensorShape({outputHeight, filterWidth,
outputWidth, outputHeight,
channels, outputWidth});
filterHeight, TensorShape colShape2 = TensorShape({outputHeight,
filterWidth}); outputWidth,
channels,
size_t height = channels * filterHeight * filterWidth; filterHeight,
size_t width = outputHeight * outputWidth; filterWidth});
VectorPtr input1 = Vector::create(imShape.getElements(), false);
VectorPtr input2 = Vector::create(imShape.getElements(), false); size_t height = channels * filterHeight * filterWidth;
MatrixPtr output1 = Matrix::create(height, width, false, false); size_t width = outputHeight * outputWidth;
MatrixPtr output2 = Matrix::create(width, height, false, false); VectorPtr input1 =
input1->uniform(0.001, 1); Vector::create(imShape.getElements(), false);
input2->copyFrom(*input1); VectorPtr input2 =
Vector::create(imShape.getElements(), false);
Im2ColFunctor<kCFO, Device, T> im2Col1; MatrixPtr output1 =
Im2ColFunctor<kOCF, Device, T> im2Col2; Matrix::create(height, width, false, false);
im2Col1(input1->getData(), MatrixPtr output2 =
imShape, Matrix::create(width, height, false, false);
output1->getData(), input1->uniform(0.001, 1);
colShape1, input2->copyFrom(*input1);
stride,
stride, Im2ColFunctor<kCFO, Device, T> im2Col1;
padding, Im2ColFunctor<kOCF, Device, T> im2Col2;
padding); im2Col1(input1->getData(),
im2Col2(input2->getData(), imShape,
imShape, output1->getData(),
output2->getData(), colShape1,
colShape2, stride,
stride, stride,
stride, padding,
padding, padding,
padding); dilation,
dilation);
// The transposition of the result of ColFormat == kCFO im2Col2(input2->getData(),
// is equal to the result of ColFormat == kOCF. imShape,
MatrixPtr test; output2->getData(),
output2->transpose(test, true); colShape2,
autotest::TensorCheckErr(*output1, *test); stride,
stride,
Col2ImFunctor<kCFO, Device, T> col2Im1; padding,
Col2ImFunctor<kOCF, Device, T> col2Im2; padding,
col2Im1(input1->getData(), dilation,
imShape, dilation);
output1->getData(),
colShape1, // The transposition of the result of ColFormat == kCFO
stride, // is equal to the result of ColFormat == kOCF.
stride, MatrixPtr test;
padding, output2->transpose(test, true);
padding); autotest::TensorCheckErr(*output1, *test);
col2Im2(input2->getData(),
imShape, Col2ImFunctor<kCFO, Device, T> col2Im1;
output2->getData(), Col2ImFunctor<kOCF, Device, T> col2Im2;
colShape2,
stride, col2Im1(input1->getData(),
stride, imShape,
padding, output1->getData(),
padding); colShape1,
stride,
autotest::TensorCheckErr(*input1, *input2); stride,
padding,
padding,
dilation,
dilation);
col2Im2(input2->getData(),
imShape,
output2->getData(),
colShape2,
stride,
stride,
padding,
padding,
dilation,
dilation);
autotest::TensorCheckErr(*input1, *input2);
}
} }
} }
} }
......
...@@ -73,7 +73,6 @@ if(MOBILE_INFERENCE) ...@@ -73,7 +73,6 @@ if(MOBILE_INFERENCE)
list(REMOVE_ITEM GSERVER_SOURCES list(REMOVE_ITEM GSERVER_SOURCES
dataproviders/DataProvider.cpp dataproviders/DataProvider.cpp
dataproviders/MultiDataProvider.cpp dataproviders/MultiDataProvider.cpp
dataproviders/ProtoDataProvider.cpp
dataproviders/PyDataProvider2.cpp dataproviders/PyDataProvider2.cpp
dataproviders/PyDataProvider.cpp) dataproviders/PyDataProvider.cpp)
...@@ -85,9 +84,49 @@ if(MOBILE_INFERENCE) ...@@ -85,9 +84,49 @@ if(MOBILE_INFERENCE)
gradientmachines/GradientMachineMode.cpp gradientmachines/GradientMachineMode.cpp
gradientmachines/MultiGradientMachine.cpp) gradientmachines/MultiGradientMachine.cpp)
# Remove useless layers # Remove layers that used in training
list(REMOVE_ITEM GSERVER_SOURCES list(REMOVE_ITEM GSERVER_SOURCES
layers/RecurrentLayerGroup.cpp) layers/RecurrentLayerGroup.cpp
layers/CostLayer.cpp
layers/MultiBoxLossLayer.cpp
layers/WarpCTCLayer.cpp
layers/CTCLayer.cpp
layers/LinearChainCTC.cpp
layers/PrintLayer.cpp)
list(REMOVE_ITEM GSERVER_SOURCES
layers/OuterProdLayer.cpp
layers/SumToOneNormLayer.cpp
layers/ConvShiftLayer.cpp
layers/InterpolationLayer.cpp
layers/AgentLayer.cpp
layers/DotMulOperator.cpp
layers/GruStepLayer.cpp
layers/LstmStepLayer.cpp
layers/ConvexCombinationLayer.cpp
layers/Conv3DLayer.cpp
layers/DeConv3DLayer.cpp
layers/CropLayer.cpp
layers/CrossEntropyOverBeam.cpp
layers/DataNormLayer.cpp
layers/FeatureMapExpandLayer.cpp
layers/HierarchicalSigmoidLayer.cpp
layers/MultinomialSampler.cpp
layers/NCELayer.cpp
layers/KmaxSeqScoreLayer.cpp
layers/MDLstmLayer.cpp
layers/MultiplexLayer.cpp
layers/PadLayer.cpp
layers/Pool3DLayer.cpp
layers/ResizeLayer.cpp
layers/RotateLayer.cpp
layers/RowConvLayer.cpp
layers/RowL2NormLayer.cpp
layers/SamplingIdLayer.cpp
layers/ScaleShiftLayer.cpp
layers/SelectiveFullyConnectedLayer.cpp
layers/SpatialPyramidPoolLayer.cpp
layers/BilinearInterpLayer.cpp
layers/ClipLayer.cpp)
endif() endif()
if(WITH_GPU) if(WITH_GPU)
......
...@@ -16,8 +16,8 @@ limitations under the License. */ ...@@ -16,8 +16,8 @@ limitations under the License. */
#include <unistd.h> #include <unistd.h>
#include <algorithm> #include <algorithm>
#include "ProtoDataProvider.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
#include "paddle/utils/StringUtil.h" #include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h" #include "paddle/utils/Util.h"
...@@ -164,8 +164,6 @@ DataProvider* DataProvider::create(const DataConfig& config, ...@@ -164,8 +164,6 @@ DataProvider* DataProvider::create(const DataConfig& config,
REGISTER_DATA_PROVIDER(simple, SimpleDataProvider); REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
REGISTER_DATA_PROVIDER(dummy, DummyDataProvider); REGISTER_DATA_PROVIDER(dummy, DummyDataProvider);
REGISTER_DATA_PROVIDER(proto, ProtoDataProvider);
REGISTER_DATA_PROVIDER(proto_sequence, ProtoSequenceDataProvider);
int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) { int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch) int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "DataFormat.pb.h"
#include "paddle/utils/Stat.h"
#include "DataProvider.h"
#include "ProtoReader.h"
namespace paddle {
/**
* @brief Provider data from protobuf data file with each sample
* specified by proto message
*
* DataSample defined in DataFormat.proto.
*
* The file format is
*
* header
*
* sample1
*
* sample2
*
* ...
*
* sampleN
*
* @note: In the data file, each message is prefixed with its length.
* The read/write of the protbuf are implemented in ProtoReader.h
*/
class ProtoDataProvider : public DataProvider {
public:
ProtoDataProvider(const DataConfig& config,
bool useGpu,
bool loadDataAll = true);
virtual void reset();
/**
* @note this size includes the sequences which are skipped because they
* are longer than the batch size.
*/
virtual int64_t getSize() {
int64_t size = sampleNums_;
if (usageRatio_ < 1.0f) {
size = static_cast<int64_t>(size * usageRatio_);
}
return size;
}
virtual void shuffle();
void loadData(const std::vector<std::string>& fileList);
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
protected:
/**
* @brief load protobuf data from a list of file
* @param[in] fileName file name of a file which contains
* a list of file names
*/
void loadData(const std::string& fileName);
/**
* @brief load protobuf data from file
* @param[in] fileName data file name
*/
void loadDataFile(const std::string& fileName);
/** @brief check data header of each data sample
* @param[in] header data header read from protobuf data
*/
void checkDataHeader(const DataHeader& header);
/**
* @brief fill protobuf data into slot_,
* slot_ is a vector of ProtoSlot in memory.
* @param[in] sample data sample read from protobuf data
*/
void fillSlots(const DataSample& sample);
/**
* @brief return true if each sample is one sequence, i.e., independent
* of other samples.
*/
inline bool iidData() const { return sequenceStartPositions_.empty(); }
/**
* @brief check that sample is consistent with header_
*/
void checkSample(const DataSample& sample);
template <class Op>
int64_t sequenceLoop(Op op, int64_t size);
template <class Op>
int64_t sampleLoop(Op op, int64_t size);
template <class Op>
int64_t subSampleLoop(Op op, int64_t size, int slot);
void showDataStats();
protected:
struct ProtoVarSlot {
std::vector<real> data;
std::vector<int> dims;
};
struct ProtoSlot {
SlotDef::SlotType type;
int dim;
std::vector<int> indexData;
std::vector<real> denseData;
std::vector<sparse_non_value_t> sparseNonValueData;
std::vector<sparse_float_value_t> sparseFloatValueData;
std::vector<int64_t> indices;
std::vector<int64_t> subIndices;
std::vector<ProtoVarSlot> varDenseData;
std::vector<std::vector<int>> varIndices;
std::vector<std::string> strData;
};
DataHeader header_;
int numVecSlots_;
std::vector<ProtoSlot> slots_;
size_t sampleNums_;
/**
* The starting position of each sequence in samples.
* The last element should be num of samples.
* If empty, each sample is one sequence.
*/
std::vector<size_t> sequenceStartPositions_;
int64_t currentSequenceIndex_;
// The size should be the number of sequences.
std::vector<size_t> shuffledSequenceIds_;
ThreadLocalD<DataBatch> cpuBatch_;
ThreadLocalD<DataBatch> gpuBatch_;
RWLock lock_;
std::vector<StatPtr> nnzStats_; // stats for number of none-zeros entries
};
/**
* @brief Special use for Proto data: instances should contain sparse-non-value
* slots
* and label.
*
* @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
*/
class ProtoSequenceDataProvider : public ProtoDataProvider {
public:
ProtoSequenceDataProvider(const DataConfig& config,
bool useGpu,
bool loadDataAll = true);
~ProtoSequenceDataProvider() {}
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
};
} // namespace paddle
...@@ -16,7 +16,6 @@ limitations under the License. */ ...@@ -16,7 +16,6 @@ limitations under the License. */
#include "NeuralNetwork.h" #include "NeuralNetwork.h"
#include "hl_gpu.h" #include "hl_gpu.h"
#include "paddle/gserver/layers/AgentLayer.h"
#include "paddle/utils/CustomStackTrace.h" #include "paddle/utils/CustomStackTrace.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h" #include "paddle/utils/Stat.h"
...@@ -28,6 +27,7 @@ limitations under the License. */ ...@@ -28,6 +27,7 @@ limitations under the License. */
#ifndef PADDLE_MOBILE_INFERENCE #ifndef PADDLE_MOBILE_INFERENCE
#include "MultiNetwork.h" #include "MultiNetwork.h"
#include "RecurrentGradientMachine.h" #include "RecurrentGradientMachine.h"
#include "paddle/gserver/layers/AgentLayer.h"
#endif #endif
namespace paddle { namespace paddle {
...@@ -192,9 +192,11 @@ void NeuralNetwork::init(const ModelConfig& config, ...@@ -192,9 +192,11 @@ void NeuralNetwork::init(const ModelConfig& config,
void NeuralNetwork::connect(LayerPtr agentLayer, void NeuralNetwork::connect(LayerPtr agentLayer,
LayerPtr realLayer, LayerPtr realLayer,
int height) { int height) {
#ifndef PADDLE_MOBILE_INFERENCE
AgentLayer* agent = dynamic_cast<AgentLayer*>(agentLayer.get()); AgentLayer* agent = dynamic_cast<AgentLayer*>(agentLayer.get());
CHECK_NOTNULL(agent); CHECK_NOTNULL(agent);
agent->setRealLayer(realLayer, height); agent->setRealLayer(realLayer, height);
#endif
} }
void NeuralNetwork::connect(std::string agentLayerName, void NeuralNetwork::connect(std::string agentLayerName,
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "Layer.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
/**
* @brief A layer for computing the dot product of two vectors.
* Input1: vector (batchSize * dim)
* Input2: vector (batchSize * dim)
* Output: a matrix: (batchSize * 1)
*/
class DotProdLayer : public Layer {
public:
explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
~DotProdLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
};
REGISTER_LAYER(dot_prod, DotProdLayer);
bool DotProdLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
CHECK_EQ(inputLayers_.size(), 2U);
CHECK_EQ(1UL, getSize())
<< "The output dimensionality of this layer should be fixed to 1.";
return true;
}
void DotProdLayer::forward(PassType passType) {
Layer::forward(passType);
MatrixPtr inV0 = getInputValue(0);
MatrixPtr inV1 = getInputValue(1);
size_t batchSize = inV0->getHeight();
CHECK_EQ(inV1->getHeight(), batchSize);
CHECK_EQ(inV0->getWidth(), inV1->getWidth());
{
REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
reserveOutput(batchSize, 1);
}
MatrixPtr outV = getOutputValue();
{
REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
outV->sumOfProducts(*inV0, *inV1, 1, 0);
}
}
void DotProdLayer::backward(const UpdateCallback& callback) {
MatrixPtr inV0 = getInputValue(0);
MatrixPtr inV1 = getInputValue(1);
MatrixPtr outG = getOutputGrad();
MatrixPtr inG0 = getInputGrad(0);
MatrixPtr inG1 = getInputGrad(1);
{
REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
if (inG0) {
inG0->addRowScale(0, *inV1, *outG);
}
if (inG1) {
inG1->addRowScale(0, *inV0, *outG);
}
}
}
} // namespace paddle
...@@ -79,6 +79,10 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, ...@@ -79,6 +79,10 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
for (int i = 0; i < config_.inputs_size(); i++) { for (int i = 0; i < config_.inputs_size(); i++) {
std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]}; std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]}; std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
std::vector<size_t> dilations = {(size_t)dilationY_[i],
(size_t)dilation_[i]};
bool useDilation = ((size_t)dilationY_[i] > 1 || (size_t)dilation_[i] > 1);
// Convolution Layer uses the GemmConv function by default. // Convolution Layer uses the GemmConv function by default.
convType = "GemmConv"; convType = "GemmConv";
...@@ -97,13 +101,14 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, ...@@ -97,13 +101,14 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
#if defined(__ARM_NEON__) || defined(__ARM_NEON) #if defined(__ARM_NEON__) || defined(__ARM_NEON)
if ((filterSize_[i] == filterSizeY_[i]) && if ((filterSize_[i] == filterSizeY_[i]) &&
(filterSize_[i] == 3 || filterSize_[i] == 4) && (filterSize_[i] == 3 || filterSize_[i] == 4) &&
(stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2)) { (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2) &&
!useDilation) {
convType = "NeonDepthwiseConv"; convType = "NeonDepthwiseConv";
} }
#endif #endif
} }
if (FLAGS_use_nnpack && !isDeconv_) { if (FLAGS_use_nnpack && !isDeconv_ && !useDilation) {
createFunction(forward_, createFunction(forward_,
"NNPACKConv", "NNPACKConv",
FuncConfig() FuncConfig()
...@@ -117,6 +122,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, ...@@ -117,6 +122,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
FuncConfig() FuncConfig()
.set("paddings", paddings) .set("paddings", paddings)
.set("strides", strides) .set("strides", strides)
.set("dilations", dilations)
.set("groups", (size_t)groups_[i])); .set("groups", (size_t)groups_[i]));
createFunction(backward_, createFunction(backward_,
...@@ -124,6 +130,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, ...@@ -124,6 +130,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
FuncConfig() FuncConfig()
.set("paddings", paddings) .set("paddings", paddings)
.set("strides", strides) .set("strides", strides)
.set("dilations", dilations)
.set("groups", (size_t)groups_[i])); .set("groups", (size_t)groups_[i]));
createFunction(backward_, createFunction(backward_,
...@@ -131,6 +138,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, ...@@ -131,6 +138,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
FuncConfig() FuncConfig()
.set("paddings", paddings) .set("paddings", paddings)
.set("strides", strides) .set("strides", strides)
.set("dilations", dilations)
.set("groups", (size_t)groups_[i])); .set("groups", (size_t)groups_[i]));
} }
} }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "L2DistanceLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
REGISTER_LAYER(l2_distance, L2DistanceLayer);
bool L2DistanceLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
Layer::init(layerMap, parameterMap);
CHECK_EQ(inputLayers_.size(), 2UL) << "The L2DistanceLayer accepts two and "
<< "only two inputs.";
CHECK_EQ(getSize(), 1UL) << "The output dimensionality of L2DistanceLayer "
<< "is fixed to be 1.";
return true;
}
void L2DistanceLayer::forward(PassType passType) {
Layer::forward(passType);
const auto inV1 = getInputValue(0);
const auto inV2 = getInputValue(1);
CHECK(inV1 && inV2);
CHECK_EQ(inV1->getHeight(), inV2->getHeight())
<< "The height of two inputs of this layer must be the same.";
CHECK_EQ(inV1->getWidth(), inV2->getWidth())
<< "The width of two inputs of this layer must be the same.";
int batchSize = inV1->getHeight();
int output_dim = getSize();
{
REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
reserveOutput(batchSize, output_dim);
auto outV = getOutputValue();
CHECK(outV) << "The output matrix should not be null.";
Matrix::resizeOrCreate(
inputSub_, inV1->getHeight(), inV1->getWidth(), false, useGpu_);
inputSub_->assign(*inV1);
inputSub_->sub(*inV2);
outV->sumOfProducts(*inputSub_, *inputSub_, 1, 0);
outV->sqrt2(*outV);
}
}
void L2DistanceLayer::backward(const UpdateCallback& callback) {
const auto outG = getOutputGrad();
const auto outV = getOutputValue();
CHECK(outG && outV);
auto inGrad1 = getInputGrad(0);
auto inGrad2 = getInputGrad(1);
{
REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
if (inGrad1 || inGrad2) {
outV->scalarDiv(*outV, 1.);
outV->dotMul(*outG, *outV);
}
if (inGrad1) inGrad1->addRowScale(0, *inputSub_, *outV);
if (inGrad2) {
inputSub_->mulScalar(-1.);
inGrad2->addRowScale(0, *inputSub_, *outV);
}
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Layer.h"
#include "paddle/math/Matrix.h"
namespace paddle {
/**
* @brief The layer calculates the l2 distance between two input vectors.
* \f[
* f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D(x_i - y_i)}
* \f]
*
* - Input1: A vector (batchSize * dataDim)
* - Input2: A vector (batchSize * dataDim)
* - Output: A vector (batchSize * 1)
*
* The configuration api is: l2_distance_layer.
*/
class L2DistanceLayer : public Layer {
public:
explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {}
~L2DistanceLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
private:
// Store the result of subtracting Input2 from Input1 in forward computation,
// which will be reused in backward computation.
MatrixPtr inputSub_;
};
} // namespace paddle
...@@ -98,6 +98,7 @@ ClassRegistrar<Layer, LayerConfig> Layer::registrar_; ...@@ -98,6 +98,7 @@ ClassRegistrar<Layer, LayerConfig> Layer::registrar_;
LayerPtr Layer::create(const LayerConfig& config) { LayerPtr Layer::create(const LayerConfig& config) {
std::string type = config.type(); std::string type = config.type();
#ifndef PADDLE_MOBILE_INFERENCE
// NOTE: As following types have illegal character '-', // NOTE: As following types have illegal character '-',
// they can not use REGISTER_LAYER to registrar. // they can not use REGISTER_LAYER to registrar.
// Besides, to fit with old training models, // Besides, to fit with old training models,
...@@ -106,7 +107,6 @@ LayerPtr Layer::create(const LayerConfig& config) { ...@@ -106,7 +107,6 @@ LayerPtr Layer::create(const LayerConfig& config) {
return LayerPtr(new MultiClassCrossEntropy(config)); return LayerPtr(new MultiClassCrossEntropy(config));
else if (type == "rank-cost") else if (type == "rank-cost")
return LayerPtr(new RankingCost(config)); return LayerPtr(new RankingCost(config));
#ifndef PADDLE_MOBILE_INFERENCE
else if (type == "auc-validation") else if (type == "auc-validation")
return LayerPtr(new AucValidation(config)); return LayerPtr(new AucValidation(config));
else if (type == "pnpair-validation") else if (type == "pnpair-validation")
......
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLDNNConcatLayer.h"
using namespace mkldnn; // NOLINT
typedef memory::format format;
namespace paddle {
REGISTER_LAYER(mkldnn_concat, MKLDNNConcatLayer);
bool MKLDNNConcatLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
if (!MKLDNNLayer::init(layerMap, parameterMap)) {
return false;
}
CHECK_GT(inputLayers_.size(), 1UL);
CHECK(!biasParameter_);
return true;
}
void MKLDNNConcatLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
reshapeInput(bs, ih, iw);
ic = inputLayers_[0]->getSize() / ih / iw;
CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
CHECK_GT(inputLayers_.size(), 1UL);
channels_.resize(inputLayers_.size());
channels_[0] = ic;
// need change the output channel, so use oc_ instead
// TODO(TJ): change API, use &oc
oc_ = ic;
for (size_t i = 1; i < inputLayers_.size(); i++) {
int batchsize, height, witdh;
reshapeInput(batchsize, height, witdh, i);
CHECK_EQ(bs, batchsize);
CHECK_EQ(ih, height);
CHECK_EQ(iw, witdh);
channels_[i] = inputLayers_[i]->getSize() / height / witdh;
CHECK_EQ((size_t)channels_[i] * height * witdh, inputLayers_[i]->getSize());
oc_ += channels_[i];
}
oh = ih;
ow = iw;
reshapeOutput(oh, ow);
resizeOutput(bs, oc_ * oh * ow);
}
void MKLDNNConcatLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetFwdBuffers(inVals_, out);
in = inVals_[0];
std::shared_ptr<concat::primitive_desc> fwdPD;
resetFwdPD(fwdPD, inVals_, out);
resetFwdPipeline(pipeline, fwdPD, inVals_, out);
}
void MKLDNNConcatLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetBwdBuffers(inGrads_, out);
in = inGrads_[0];
resetBwdPipeline(pipeline, bwds_, inGrads_, out);
}
void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
inputs.resize(inputLayers_.size());
bool has8c = false, has16c = false, hasnc = false;
for (size_t i = 0; i < inputs.size(); i++) {
// resetInValue will use ic_ so temporary change as current input's channel
// TODO(TJ): change ic_ as vector then can remove channels_
ic_ = channels_[i];
resetInValue(inputs[i], nullptr, i);
CHECK(inputs[i]);
auto dm = inputs[i]->getDims();
// inputs format can be different, but ndims must equal
CHECK(i == 0 || dm.size() == inputs[0]->getDims().size());
CHECK_EQ(bs_, dm[0]);
CHECK_EQ(channels_[i], dm[1]);
if (dm.size() > 2) {
CHECK_EQ(ih_, dm[2]);
CHECK_EQ(iw_, dm[3]);
}
if (inputs[i]->getFormat() == format::nc) {
hasnc = true;
}
if (inputs[i]->getFormat() == format::nChw8c) {
has8c = true;
}
if (inputs[i]->getFormat() == format::nChw16c) {
has16c = true;
}
}
// change back, ic_ always save the input 0 size
ic_ = channels_[0];
format outFmt;
if (has16c && oc_ % 16 == 0) {
outFmt = format::nChw16c;
} else if (has8c && oc_ % 8 == 0) {
outFmt = format::nChw8c;
} else if (hasnc) {
CHECK(oh_ == 1 && ow_ == 1);
outFmt = format::nc;
} else {
outFmt = format::nchw;
}
memory::dims outDims =
hasnc ? memory::dims{bs_, oc_} : memory::dims{bs_, oc_, oh_, ow_};
auto outPD = MKLDNNMatrix::createPrimitiveDesc(outDims, outFmt, engine_);
resetOutValue(out, outPD);
}
void MKLDNNConcatLayer::resetFwdPD(std::shared_ptr<concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out) {
std::vector<memory::primitive_desc> srcPDs;
for (size_t i = 0; i < inputs.size(); i++) {
srcPDs.push_back(inputs[i]->getPrimitiveDesc());
}
CHECK(out);
pd.reset(new concat::primitive_desc(out->getMemoryDesc(), axis_, srcPDs));
CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
}
void MKLDNNConcatLayer::resetFwdPipeline(
std::vector<primitive>& pipeline,
std::shared_ptr<concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
std::vector<primitive::at> srcs;
for (size_t i = 0; i < inputs.size(); i++) {
srcs.push_back(*(inputs[i]));
}
fwd_.reset(new concat(*pd, srcs, *out));
pipeline.push_back(*fwd_);
}
void MKLDNNConcatLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
CHECK(outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
CHECK(out);
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
CHECK(inVals_[i]);
// resetInGrad will use inVal_
// TODO(TJ): change move inVals_ to MKLDNNLayer ans remove inVal_
inVal_ = inVals_[i];
resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc());
}
// change back, inVal_ always save the input 0
inVal_ = inVals_[0];
}
void MKLDNNConcatLayer::resetBwdPipeline(
std::vector<mkldnn::primitive>& pipeline,
std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
// reset the backward primitives
memory::dims offsets = {0, 0, 0, 0};
prims.resize(inputs.size());
CHECK_EQ(inputs.size(), channels_.size());
for (size_t i = 0; i < inputs.size(); i++) {
auto viewPD = view::primitive_desc(
out->getPrimitiveDesc(), inputs[i]->getDims(), offsets);
auto bwdPD = reorder::primitive_desc(viewPD.dst_primitive_desc(),
inputs[i]->getPrimitiveDesc());
prims[i].reset(new reorder(bwdPD, *out, *(inputs[i])));
offsets[axis_] += channels_[i];
// push to pipeline
pipeline.push_back(*prims[i]);
}
}
} // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "MKLDNNLayer.h"
#include "mkldnn.hpp"
namespace paddle {
/**
* @brief A subclass of MKLDNNLayer Concatenate layer.
*
* The config file api is mkldnn_concat
*/
class MKLDNNConcatLayer : public MKLDNNLayer {
protected:
std::vector<MKLDNNMatrixPtr> inVals_;
std::vector<MKLDNNMatrixPtr> inGrads_;
std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
// input channel numbers
std::vector<int> channels_;
// concat_dimension in MKLDNN
// if axis_ == 0, concat batchsize
// if axis_ == 1, concat channel (default)
int axis_;
public:
explicit MKLDNNConcatLayer(const LayerConfig& config)
: MKLDNNLayer(config), axis_(1) {}
~MKLDNNConcatLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void printSizeInfo() override {
CHECK_EQ(channels_.size(), inputLayers_.size());
for (size_t i = 0; i < channels_.size(); ++i) {
VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName()
<< ": " << bs_ << ", " << channels_[i] << ", " << ih_
<< ", " << iw_;
}
VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_
<< ", " << ow_;
}
void printValueFormat() override {
for (size_t i = 0; i < inVals_.size(); ++i) {
VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
<< ": " << inVals_[i]->getFormat() << " >>>";
}
if (outVal_) {
VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
}
if (extOutVal_) {
VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
}
}
void printGradFormat() override {
if (extOutGrad_) {
VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
}
if (outGrad_) {
VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
}
for (size_t i = 0; i < inGrads_.size(); ++i) {
VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
<< ": " << inGrads_[i]->getFormat() << "<<<";
}
}
protected:
/**
* Forward functions: reset buffers(inputs, output, bias),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(inputs, output, bias)
* reset primitives and pipeline
*/
void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
};
} // namespace paddle
...@@ -21,8 +21,8 @@ namespace paddle { ...@@ -21,8 +21,8 @@ namespace paddle {
bool MKLDNNLayer::init(const LayerMap& layerMap, bool MKLDNNLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) { const ParameterMap& parameterMap) {
CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn." CHECK(FLAGS_use_mkldnn) << "MKLDNNLayers only support use_mkldnn."
<< "Please set WITH_MKLDNN=ON " << "Please set WITH_MKL=ON "
<< "and set use_mkldnn=True"; << "and set use_mkldnn=True";
CHECK(!useGpu_) << "Do not support GPU yet"; CHECK(!useGpu_) << "Do not support GPU yet";
...@@ -138,8 +138,11 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) { ...@@ -138,8 +138,11 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
} }
} }
void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) { void MKLDNNLayer::reshapeInput(int& batchsize,
const Argument& input = inputLayers_[0]->getOutput(); int& height,
int& width,
size_t inputIdx) {
const Argument& input = inputLayers_[inputIdx]->getOutput();
batchsize = input.getBatchSize(); batchsize = input.getBatchSize();
int h = input.getFrameHeight(); int h = input.getFrameHeight();
int w = input.getFrameWidth(); int w = input.getFrameWidth();
......
...@@ -178,7 +178,10 @@ protected: ...@@ -178,7 +178,10 @@ protected:
/** /**
* reshape the input image sizes and input batchsize * reshape the input image sizes and input batchsize
*/ */
void reshapeInput(int& batchsize, int& height, int& width); void reshapeInput(int& batchsize,
int& height,
int& width,
size_t inputIdx = 0);
/** /**
* reshape output image sizes * reshape output image sizes
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MaxPoolWithMaskLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
bool MaxPoolWithMaskLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
PoolLayer::init(layerMap, parameterMap);
setOutput("mask", &mask_);
return true;
}
size_t MaxPoolWithMaskLayer::getSize() {
CHECK_EQ(inputLayers_.size(), 1UL);
size_t layerSize = 0;
outputY_ = outputSize(imgSizeY_,
sizeY_,
confPaddingY_,
strideY_,
/* caffeMode */ false);
outputX_ = outputSize(imgSize_,
sizeX_,
confPadding_,
stride_,
/* caffeMode */ false);
layerSize = outputX_ * outputY_ * channels_;
getOutput().setFrameHeight(outputY_);
getOutput().setFrameWidth(outputX_);
return layerSize;
}
void MaxPoolWithMaskLayer::forward(PassType passType) {
size_t size = getSize();
MatrixPtr inputV = inputLayers_[0]->getOutputValue();
int batchSize = inputV->getHeight();
resetOutput(batchSize, size);
MatrixPtr outV = getOutputValue();
CHECK_EQ(size, outV->getWidth());
resetSpecifyOutput(mask_,
batchSize,
size,
/* isValueClean */ false,
/* isGradClean */ true);
MatrixPtr maskV = mask_.value;
outV->maxPoolForward(*inputV,
imgSizeY_,
imgSize_,
channels_,
sizeX_,
sizeY_,
strideY_,
stride_,
outputY_,
outputX_,
confPaddingY_,
confPadding_,
maskV);
}
void MaxPoolWithMaskLayer::backward(const UpdateCallback& callback) {
(void)callback;
if (NULL == getInputGrad(0)) {
return;
}
MatrixPtr outGrad = getOutputGrad();
MatrixPtr inputV = inputLayers_[0]->getOutputValue();
MatrixPtr outV = getOutputValue();
MatrixPtr inputGrad = inputLayers_[0]->getOutputGrad();
inputGrad->maxPoolBackward(*inputV,
imgSizeY_,
imgSize_,
*outGrad,
*outV,
sizeX_,
sizeY_,
strideY_,
stride_,
outputY_,
outputX_,
1,
1,
confPaddingY_,
confPadding_);
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "PoolLayer.h"
#include "paddle/math/Matrix.h"
namespace paddle {
/**
* @brief Basic parent layer of different kinds of pooling
*/
class MaxPoolWithMaskLayer : public PoolLayer {
protected:
Argument mask_;
public:
explicit MaxPoolWithMaskLayer(const LayerConfig& config)
: PoolLayer(config) {}
size_t getSize();
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
};
} // namespace paddle
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "PoolLayer.h" #include "PoolLayer.h"
#include "MaxPoolWithMaskLayer.h"
#include "PoolProjectionLayer.h" #include "PoolProjectionLayer.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -44,7 +45,6 @@ bool PoolLayer::init(const LayerMap& layerMap, ...@@ -44,7 +45,6 @@ bool PoolLayer::init(const LayerMap& layerMap,
strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride(); strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding(); confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
return true; return true;
} }
...@@ -57,6 +57,8 @@ Layer* PoolLayer::create(const LayerConfig& config) { ...@@ -57,6 +57,8 @@ Layer* PoolLayer::create(const LayerConfig& config) {
} else if (CudnnPoolLayer::typeCheck(pool)) { } else if (CudnnPoolLayer::typeCheck(pool)) {
return new CudnnPoolLayer(config); return new CudnnPoolLayer(config);
#endif #endif
} else if (pool == "max-pool-with-mask") {
return new MaxPoolWithMaskLayer(config);
} else { } else {
LOG(FATAL) << "Unknown pool type: " << pool; LOG(FATAL) << "Unknown pool type: " << pool;
return nullptr; return nullptr;
......
...@@ -100,8 +100,9 @@ void ROIPoolLayer::forward(PassType passType) { ...@@ -100,8 +100,9 @@ void ROIPoolLayer::forward(PassType passType) {
size_t roiEndH = round(bottomROIs[4] * spatialScale_); size_t roiEndH = round(bottomROIs[4] * spatialScale_);
CHECK_GE(roiBatchIdx, 0UL); CHECK_GE(roiBatchIdx, 0UL);
CHECK_LT(roiBatchIdx, batchSize); CHECK_LT(roiBatchIdx, batchSize);
size_t roiHeight = std::max(roiEndH - roiStartH + 1, 1UL); size_t roiHeight =
size_t roiWidth = std::max(roiEndW - roiStartW + 1, 1UL); std::max(roiEndH - roiStartH + 1, static_cast<size_t>(1));
size_t roiWidth = std::max(roiEndW - roiStartW + 1, static_cast<size_t>(1));
real binSizeH = real binSizeH =
static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_); static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_);
real binSizeW = real binSizeW =
...@@ -114,10 +115,14 @@ void ROIPoolLayer::forward(PassType passType) { ...@@ -114,10 +115,14 @@ void ROIPoolLayer::forward(PassType passType) {
size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW)); size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW));
size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH)); size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH));
size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW)); size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW));
hstart = std::min(std::max(hstart + roiStartH, 0UL), height_); hstart = std::min(
wstart = std::min(std::max(wstart + roiStartW, 0UL), width_); std::max(hstart + roiStartH, static_cast<size_t>(0)), height_);
hend = std::min(std::max(hend + roiStartH, 0UL), height_); wstart = std::min(
wend = std::min(std::max(wend + roiStartW, 0UL), width_); std::max(wstart + roiStartW, static_cast<size_t>(0)), width_);
hend = std::min(std::max(hend + roiStartH, static_cast<size_t>(0)),
height_);
wend = std::min(std::max(wend + roiStartW, static_cast<size_t>(0)),
width_);
bool isEmpty = (hend <= hstart) || (wend <= wstart); bool isEmpty = (hend <= hstart) || (wend <= wstart);
size_t poolIndex = ph * pooledWidth_ + pw; size_t poolIndex = ph * pooledWidth_ + pw;
......
# gserver pacakge unittests # gserver pacakge unittests
add_simple_unittest(test_LinearChainCRF) add_simple_unittest(test_LinearChainCRF)
add_simple_unittest(test_MultinomialSampler)
add_simple_unittest(test_RecurrentLayer) add_simple_unittest(test_RecurrentLayer)
if(NOT MOBILE_INFERENCE)
add_simple_unittest(test_MultinomialSampler)
endif()
function(gserver_test TARGET) function(gserver_test TARGET)
add_unittest_without_exec(${TARGET} add_unittest_without_exec(${TARGET}
${TARGET}.cpp ${TARGET}.cpp
...@@ -24,8 +27,9 @@ gserver_test(test_ConvUnify) ...@@ -24,8 +27,9 @@ gserver_test(test_ConvUnify)
gserver_test(test_BatchNorm) gserver_test(test_BatchNorm)
gserver_test(test_KmaxSeqScore) gserver_test(test_KmaxSeqScore)
gserver_test(test_Expand) gserver_test(test_Expand)
gserver_test(test_MaxPoolingWithMaskOutput)
########## test_Mkldnn layers and activations ########## ########## test_MKLDNN layers and activations ##########
if(WITH_MKLDNN) if(WITH_MKLDNN)
add_unittest_without_exec(test_MKLDNN add_unittest_without_exec(test_MKLDNN
test_MKLDNN.cpp test_MKLDNN.cpp
...@@ -48,7 +52,7 @@ if(WITH_PYTHON) ...@@ -48,7 +52,7 @@ if(WITH_PYTHON)
endif() endif()
############### test_WarpCTCLayer ####################### ############### test_WarpCTCLayer #######################
if(NOT WITH_DOUBLE) if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
add_unittest_without_exec(test_WarpCTCLayer add_unittest_without_exec(test_WarpCTCLayer
test_WarpCTCLayer.cpp) test_WarpCTCLayer.cpp)
...@@ -58,17 +62,6 @@ if(NOT WITH_DOUBLE) ...@@ -58,17 +62,6 @@ if(NOT WITH_DOUBLE)
endif() endif()
if(NOT MOBILE_INFERENCE) if(NOT MOBILE_INFERENCE)
################### test_ProtoDataProvider ############
add_unittest_without_exec(test_ProtoDataProvider
test_ProtoDataProvider.cpp)
# test_ProtoDataProvider will mkdir as same name,
# so if WORKING_DIRECTORY is default directory, then
# mkdir will get error.
add_test(NAME test_ProtoDataProvider
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
################## test_Evaluator ####################### ################## test_Evaluator #######################
add_unittest(test_Evaluator add_unittest(test_Evaluator
test_Evaluator.cpp) test_Evaluator.cpp)
...@@ -106,3 +99,24 @@ add_test(NAME test_PyDataProvider2 ...@@ -106,3 +99,24 @@ add_test(NAME test_PyDataProvider2
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2 COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
) )
################# test_CompareSparse ##################
add_unittest_without_exec(test_CompareSparse
test_CompareSparse.cpp)
if(NOT ON_TRAVIS)
add_test(NAME test_CompareSparse
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
./.set_port.sh -p port -n 6
${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif()
################ test_CompareTwoNets ######################
add_unittest_without_exec(test_CompareTwoNets
test_CompareTwoNets.cpp)
add_test(NAME test_CompareTwoNets
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
...@@ -23,7 +23,7 @@ limitations under the License. */ ...@@ -23,7 +23,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
/** /**
* @brief test the functionality of Mkldnnlayers * @brief test the functionality of MKLDNNlayers and MKLDNNActivations
* refer to paddle original function * refer to paddle original function
*/ */
class MKLDNNTester { class MKLDNNTester {
......
./test_ProtoDataProvider/data1.bin
./test_ProtoDataProvider/data2.bin
./test_ProtoDataProvider/data1.bin.gz
./test_ProtoDataProvider/data2.bin.gz
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
...@@ -14,27 +15,50 @@ ...@@ -14,27 +15,50 @@
from paddle.trainer_config_helpers import * from paddle.trainer_config_helpers import *
################################### Data Configuration ################################### ######################## data source ################################
TrainData(ProtoData(files = "trainer/tests/mnist.list")) dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
################################### Algorithm Configuration ################################### dict_file = dict()
settings(batch_size = 1000, for line_count, line in enumerate(open(dict_path, "r")):
learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) dict_file[line.strip()] = line_count
################################### Network Configuration ###################################
data = data_layer(name ="input", size=784)
fc1 = fc_layer(input=data, size=800, define_py_data_sources2(
bias_attr=True, train_list='gserver/tests/Sequence/train.list',
act=SigmoidActivation()) test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
fc2 = fc_layer(input=fc1, size=800, settings(batch_size=5)
bias_attr=True, ######################## network configure ################################
act=SigmoidActivation()) dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 256
label_dim = 3
sparse_update = get_config_arg("sparse_update", bool, False)
output = fc_layer(input=[fc1, fc2], size=10, data = data_layer(name="word", size=dict_dim)
bias_attr=True,
act=SoftmaxActivation())
lbl = data_layer(name ="label", size=1) emb = embedding_layer(
input=data,
size=word_dim,
param_attr=ParamAttr(sparse_update=sparse_update))
cost = classification_cost(input=output, label=lbl) with mixed_layer(size=hidden_dim * 4) as lstm_input:
outputs(cost) lstm_input += full_matrix_projection(input=emb)
lstm = lstmemory(
input=lstm_input,
act=TanhActivation(),
gate_act=SigmoidActivation(),
state_act=TanhActivation())
lstm_last = last_seq(input=lstm)
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=lstm_last)
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
...@@ -14,27 +15,42 @@ ...@@ -14,27 +15,42 @@
from paddle.trainer_config_helpers import * from paddle.trainer_config_helpers import *
################################### Data Configuration ################################### ######################## data source ################################
TrainData(ProtoData(files = "trainer/tests/mnist.list")) dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
################################### Algorithm Configuration ################################### dict_file = dict()
settings(batch_size = 1000, for line_count, line in enumerate(open(dict_path, "r")):
learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) dict_file[line.strip()] = line_count
################################### Network Configuration ###################################
data = data_layer(name ="input", size=784)
fc1 = fc_layer(input=data, size=800, define_py_data_sources2(
bias_attr=True, train_list='gserver/tests/Sequence/train.list',
act=SigmoidActivation()) test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
fc2 = fc_layer(input=fc1, size=800, settings(batch_size=5)
bias_attr=True, ######################## network configure ################################
act=SigmoidActivation()) dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 128
label_dim = 3
output = fc_layer(input=[fc1, fc2], size=10, # This config is designed to be equivalent with sequence_recurrent_group.py
bias_attr=True,
act=SoftmaxActivation())
lbl = data_layer(name ="label", size=1) data = data_layer(name="word", size=dict_dim)
cost = classification_cost(input=output, label=lbl) emb = embedding_layer(
outputs(cost) input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
recurrent_last = last_seq(input=recurrent)
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=recurrent_last)
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
######################## data source ################################
dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
dict_file = dict()
for line_count, line in enumerate(open(dict_path, "r")):
dict_file[line.strip()] = line_count
define_py_data_sources2(
train_list='gserver/tests/Sequence/train.list',
test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
settings(batch_size=5)
######################## network configure ################################
dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 128
label_dim = 3
# This config is designed to be equivalent with sequence_recurrent.py
data = data_layer(name="word", size=dict_dim)
emb = embedding_layer(
input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
def step(y):
mem = memory(name="rnn_state", size=hidden_dim)
with mixed_layer(
name="rnn_state",
size=hidden_dim,
bias_attr=False,
act=SoftmaxActivation()) as out:
out += identity_projection(input=y)
out += full_matrix_projection(
input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__"))
return out
recurrent = recurrent_group(name="rnn", step=step, input=emb)
recurrent_last = last_seq(input=recurrent)
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=recurrent_last)
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
...@@ -22,8 +22,7 @@ limitations under the License. */ ...@@ -22,8 +22,7 @@ limitations under the License. */
using namespace paddle; // NOLINT using namespace paddle; // NOLINT
using namespace std; // NOLINT using namespace std; // NOLINT
static const string& configFile1 = static const string& configFile1 = "gserver/tests/sequence_lstm.conf";
"trainer/tests/sample_trainer_config_compare_sparse.conf";
DECLARE_bool(use_gpu); DECLARE_bool(use_gpu);
DECLARE_string(config); DECLARE_string(config);
......
...@@ -30,8 +30,6 @@ DECLARE_bool(use_gpu); ...@@ -30,8 +30,6 @@ DECLARE_bool(use_gpu);
DECLARE_string(config); DECLARE_string(config);
DECLARE_string(nics); DECLARE_string(nics);
DEFINE_string(config_file_a, "", "config of one network to compare");
DEFINE_string(config_file_b, "", "config of another network to compare");
DEFINE_bool(need_high_accuracy, DEFINE_bool(need_high_accuracy,
false, false,
"whether need to run in double accuracy"); "whether need to run in double accuracy");
...@@ -42,6 +40,10 @@ DEFINE_double( ...@@ -42,6 +40,10 @@ DEFINE_double(
DECLARE_bool(thread_local_rand_use_global_seed); DECLARE_bool(thread_local_rand_use_global_seed);
DECLARE_int32(seed); DECLARE_int32(seed);
static const string& config_file_a = "gserver/tests/sequence_recurrent.py";
static const string& config_file_b =
"gserver/tests/sequence_recurrent_group.py";
struct ComData { struct ComData {
vector<Argument> outArgs; vector<Argument> outArgs;
vector<ParameterPtr> parameters; vector<ParameterPtr> parameters;
...@@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) { ...@@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) {
DataBatch dataBatch; DataBatch dataBatch;
int32_t batchSize = trainer.getConfig().opt_config().batch_size(); int32_t batchSize = trainer.getConfig().opt_config().batch_size();
trainer.getDataProvider()->reset();
trainer.getDataProvider()->setSkipShuffle(); trainer.getDataProvider()->setSkipShuffle();
trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch); trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
...@@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { ...@@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
TEST(Trainer, create) { TEST(Trainer, create) {
ComData dataA; ComData dataA;
calcGradient(dataA, FLAGS_config_file_a); calcGradient(dataA, config_file_a);
LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n"; LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
ComData dataB; ComData dataB;
calcGradient(dataB, FLAGS_config_file_b); calcGradient(dataB, config_file_b);
LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n"; LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
compareGradient(dataA, dataB); compareGradient(dataA, dataB);
......
...@@ -434,7 +434,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) { ...@@ -434,7 +434,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
config.layerConfig.set_partial_sum(1); config.layerConfig.set_partial_sum(1);
config.layerConfig.set_shared_biases(true); config.layerConfig.set_shared_biases(true);
int dilation = 1; int dilation = 2;
if (type == "cudnn_conv") { if (type == "cudnn_conv") {
#if CUDNN_VERSION >= 6000 #if CUDNN_VERSION >= 6000
dilation = 2; dilation = 2;
...@@ -583,6 +583,7 @@ TEST(Layer, maxoutLayer) { ...@@ -583,6 +583,7 @@ TEST(Layer, maxoutLayer) {
testLayerGrad(config, "maxout", 10, false, useGpu); testLayerGrad(config, "maxout", 10, false, useGpu);
} }
} }
void testFcLayer(string format, size_t nnz) { void testFcLayer(string format, size_t nnz) {
TestConfig config; TestConfig config;
config.biasSize = 1024; config.biasSize = 1024;
...@@ -1081,6 +1082,21 @@ TEST(Layer, InterpolationLayer) { ...@@ -1081,6 +1082,21 @@ TEST(Layer, InterpolationLayer) {
} }
} }
TEST(Layer, DotProdLayer) {
TestConfig config;
config.layerConfig.set_type("dot_prod");
config.layerConfig.set_size(1);
config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
config.layerConfig.add_inputs();
config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) {
testLayerGrad(config, "dot_prod", 10, false, useGpu);
}
}
TEST(Layer, OuterProdLayer) { TEST(Layer, OuterProdLayer) {
TestConfig config; TestConfig config;
config.layerConfig.set_type("out_prod"); config.layerConfig.set_type("out_prod");
...@@ -1234,6 +1250,7 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) { ...@@ -1234,6 +1250,7 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
TEST(Layer, PoolLayer) { TEST(Layer, PoolLayer) {
testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false); testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false); testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true); testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
...@@ -1242,6 +1259,7 @@ TEST(Layer, PoolLayer) { ...@@ -1242,6 +1259,7 @@ TEST(Layer, PoolLayer) {
testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
#endif #endif
} }
...@@ -2427,6 +2445,25 @@ TEST(Layer, ScaleSubRegionLayer) { ...@@ -2427,6 +2445,25 @@ TEST(Layer, ScaleSubRegionLayer) {
} }
} }
TEST(Layer, L2DistanceLayer) {
TestConfig config;
config.layerConfig.set_type("l2_distance");
config.layerConfig.set_size(1);
config.biasSize = 0;
const size_t input_dim = 27;
const size_t batch_size = 11;
config.inputDefs.push_back({INPUT_DATA, "layer_0", input_dim, 0});
config.inputDefs.push_back({INPUT_DATA, "layer_1", input_dim, 0});
config.layerConfig.add_inputs();
config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) {
testLayerGrad(config, "l2_distance", batch_size, false, useGpu);
}
}
int main(int argc, char** argv) { int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
initMain(argc, argv); initMain(argc, argv);
......
...@@ -313,6 +313,47 @@ TEST(MKLDNNLayer, AddtoLayer) { ...@@ -313,6 +313,47 @@ TEST(MKLDNNLayer, AddtoLayer) {
testAddtoLayer({4, 12, 1, 1}, 3); testAddtoLayer({4, 12, 1, 1}, 3);
} }
static void getMKLDNNConcatConfig(TestConfig& cfg,
const std::vector<testImageDesc>& inputs) {
CHECK_GE(inputs.size(), 2) << "at least two inputs";
int oc = inputs[0].ic;
for (size_t i = 1; i < inputs.size(); ++i) {
CHECK_EQ(inputs[i].bs, inputs[0].bs);
CHECK_EQ(inputs[i].ih, inputs[0].ih);
CHECK_EQ(inputs[i].iw, inputs[0].iw);
oc += inputs[i].ic;
}
cfg.biasSize = 0;
cfg.layerConfig.set_type("mkldnn_concat");
cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
cfg.layerConfig.set_active_type("relu");
for (size_t i = 0; i < inputs.size(); ++i) {
std::stringstream ss;
ss << "layer_" << i;
cfg.inputDefs.push_back(
{INPUT_DATA,
ss.str(),
(size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
0});
LayerInputConfig* input = cfg.layerConfig.add_inputs();
ImageConfig* img_conf = input->mutable_image_conf();
img_conf->set_channels(inputs[i].ic);
img_conf->set_img_size_y(inputs[i].ih);
img_conf->set_img_size(inputs[i].iw);
}
}
void testConcatLayer(const std::vector<testImageDesc>& inputs) {
TestConfig dnnConfig;
getMKLDNNConcatConfig(dnnConfig, inputs);
RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
}
TEST(MKLDNNLayer, ConcatLayer) {
testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});
testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});
}
void testActivation(std::string actType, const testImageDesc& pm) { void testActivation(std::string actType, const testImageDesc& pm) {
// TODO(TJ): remove me when paddle support elu activation // TODO(TJ): remove me when paddle support elu activation
if (actType == "mkldnn_elu") { if (actType == "mkldnn_elu") {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <string>
#include <vector>
#include "LayerGradUtil.h"
#include "paddle/math/MathUtils.h"
#include "paddle/testing/TestUtil.h"
using namespace paddle;
void setPoolConfig(TestConfig* config,
PoolConfig* pool,
const string& poolType) {
(*config).biasSize = 0;
(*config).layerConfig.set_type("pool");
(*config).layerConfig.set_num_filters(1);
int kw = 3, kh = 3;
int pw = 0, ph = 0;
int sw = 2, sh = 2;
pool->set_pool_type(poolType);
pool->set_channels(1);
pool->set_size_x(kw);
pool->set_size_y(kh);
pool->set_start(0);
pool->set_padding(pw);
pool->set_padding_y(ph);
pool->set_stride(sw);
pool->set_stride_y(sh);
int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
pool->set_output_x(ow);
pool->set_output_y(oh);
}
void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat,
const string& poolType,
bool use_gpu,
MatrixPtr& maskMat) {
TestConfig config;
config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0});
LayerInputConfig* input = config.layerConfig.add_inputs();
PoolConfig* pool = input->mutable_pool_conf();
pool->set_img_size(5);
pool->set_img_size_y(5);
setPoolConfig(&config, pool, poolType);
config.layerConfig.set_size(pool->output_x() * pool->output_y() *
pool->channels());
config.layerConfig.set_name("MaxPoolWithMask");
std::vector<DataLayerPtr> dataLayers;
LayerMap layerMap;
vector<Argument> datas;
initDataLayer(config,
&dataLayers,
&datas,
&layerMap,
"MaxPoolWithMask",
1,
false,
use_gpu);
dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
FLAGS_use_gpu = use_gpu;
std::vector<ParameterPtr> parameters;
LayerPtr maxPoolingWithMaskOutputLayer;
initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
maxPoolingWithMaskOutputLayer->forward(PASS_GC);
checkMatrixEqual(maxPoolingWithMaskOutputLayer->getOutput("mask").value,
maskMat);
}
TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
bool useGpu = false;
MatrixPtr inputMat;
MatrixPtr maskMat;
real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1,
0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8,
0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0};
real maskData[] = {12, 4, 22, 24};
inputMat = Matrix::create(1, 25, false, useGpu);
maskMat = Matrix::create(1, 4, false, useGpu);
inputMat->setData(inputData);
maskMat->setData(maskData);
doOneMaxPoolingWithMaskOutputTest(
inputMat, "max-pool-with-mask", useGpu, maskMat);
#ifdef PADDLE_WITH_CUDA
useGpu = true;
inputMat = Matrix::create(1, 25, false, useGpu);
maskMat = Matrix::create(1, 4, false, useGpu);
inputMat->copyFrom(inputData, 25);
maskMat->copyFrom(maskData, 4);
doOneMaxPoolingWithMaskOutputTest(
inputMat, "max-pool-with-mask", useGpu, maskMat);
#endif
}
...@@ -1902,5 +1902,52 @@ void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b, ...@@ -1902,5 +1902,52 @@ void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
} }
template class BaseMatrixT<real>; template class BaseMatrixT<real>;
#ifndef PADDLE_MOBILE_INFERENCE
template class BaseMatrixT<int>; template class BaseMatrixT<int>;
#else
template <>
void BaseMatrixT<int>::zero() {
applyUnary(unary::Zero<int>());
}
template <>
void BaseMatrixT<int>::assign(int p) {
applyUnary(unary::Assign<int>(p));
}
template <>
void BaseMatrixT<int>::isEqualTo(BaseMatrixT& b, int value) {
applyBinary(binary::IsEqual<int>(value), b);
}
template <>
void BaseMatrixT<int>::neg() {
applyUnary(unary::Neg<int>());
}
template <>
void BaseMatrixT<int>::abs2() {
applyUnary(unary::Abs<int>());
}
template <>
void BaseMatrixT<int>::add(int p) {
applyUnary(unary::Add<int>(p));
}
template <>
void BaseMatrixT<int>::add(int p1, int p2) {
applyUnary(unary::Add2<int>(p1, p2));
}
template <>
void BaseMatrixT<int>::applyL1(int learningRate, int decayRate) {
applyUnary(unary::ApplyL1<int>(learningRate * decayRate));
}
#endif
} // namespace paddle } // namespace paddle
...@@ -25,6 +25,19 @@ else() ...@@ -25,6 +25,19 @@ else()
message(STATUS "Compile with MKLDNNMatrix") message(STATUS "Compile with MKLDNNMatrix")
endif() endif()
if(MOBILE_INFERENCE)
list(REMOVE_ITEM MATH_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/SIMDFunctions.cpp)
# Remove sparse
list(REMOVE_ITEM MATH_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h
${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h)
list(REMOVE_ITEM MATH_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp
${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp
${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp)
endif()
set(MATH_SOURCES set(MATH_SOURCES
"${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu" "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
"${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu" "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
...@@ -17,9 +17,13 @@ limitations under the License. */ ...@@ -17,9 +17,13 @@ limitations under the License. */
#include "paddle/utils/StringUtil.h" #include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h" #include "paddle/utils/Util.h"
#ifndef PADDLE_MOBILE_INFERENCE
DEFINE_int32(pool_limit_size, DEFINE_int32(pool_limit_size,
536870912, 536870912,
"maximum memory size managed by a memory pool, default is 512M"); "maximum memory size managed by a memory pool, default is 512M");
#else
DEFINE_int32(pool_limit_size, 0, "default is 0");
#endif
namespace paddle { namespace paddle {
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册