Commit 5ee63bb6 authored by wangmeng28

Merge remote-tracking branch 'upstream/develop' into factorization_machine_layer

......@@ -36,8 +36,7 @@ include(simd)
################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND})
option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamically linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
......@@ -82,10 +81,8 @@ if(ANDROID OR IOS)
"Disable PYTHON when cross-compiling for Android and iOS" FORCE)
set(WITH_RDMA OFF CACHE STRING
"Disable RDMA when cross-compiling for Android and iOS" FORCE)
set(WITH_MKLDNN OFF CACHE STRING
"Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
set(WITH_MKLML OFF CACHE STRING
"Disable MKLML package when cross-compiling for Android and iOS" FORCE)
set(WITH_MKL OFF CACHE STRING
"Disable MKL when cross-compiling for Android and iOS" FORCE)
# Compile PaddlePaddle mobile inference library
if (NOT WITH_C_API)
......@@ -111,6 +108,14 @@ else()
set(THIRD_PARTY_BUILD_TYPE Release)
endif()
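# WITH_MKL is the master switch for the options below: WITH_MKLML always
# follows it, and MKL-DNN is additionally gated on AVX2 support.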
set(WITH_MKLML ${WITH_MKL})
if (WITH_MKL AND ${AVX2_FOUND})
set(WITH_MKLDNN ON)
else()
message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
set(WITH_MKLDNN OFF)
endif()
########################################################################################
include(external/mklml) # download mklml package
......@@ -158,14 +163,15 @@ set(EXTERNAL_LIBS
)
if(WITH_GPU)
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif(NOT WITH_DSO)
include(cuda)
endif(WITH_GPU)
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB})
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
if(USE_NNPACK)
......
......@@ -76,27 +76,14 @@ else()
include_directories(${CUDA_TOOLKIT_INCLUDE})
endif(NOT WITH_GPU)
if(WITH_MKLDNN)
add_definitions(-DPADDLE_USE_MKLDNN)
if (WITH_MKLML AND MKLDNN_IOMP_DIR)
message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}")
set(OPENMP_FLAGS "-fopenmp")
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
else()
find_package(OpenMP)
if(OPENMP_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
else()
message(WARNING "Can not find OpenMP."
"Some performance features in MKLDNN may not be available")
endif()
endif()
endif(WITH_MKLDNN)
if (WITH_MKLML AND MKLML_IOMP_LIB)
message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
set(OPENMP_FLAGS "-fopenmp")
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
......
......@@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
# Set the architecture for iOS
if(NOT DEFINED IOS_ARCH)
if(IOS_PLATFORM STREQUAL "OS")
# FIXME(liuyiqun): support "armv7;armv7s;arm64" future
set(IOS_ARCH "arm64")
set(IOS_ARCH "armv7;armv7s;arm64")
elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
# FIXME(liuyiqun): support "i386;x86_64" future
set(IOS_ARCH "x86_64")
set(IOS_ARCH "i386;x86_64")
endif()
endif()
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")
......@@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_
# Hidden visibility is required for cxx on iOS
set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
......
if(NOT WITH_GPU)
return()
endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
######################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
# detect_installed_gpus(out_variable)
function(detect_installed_gpus out_variable)
if(NOT CUDA_gpu_detect_output)
set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
file(WRITE ${cufile} ""
"#include <cstdio>\n"
"int main() {\n"
" int count = 0;\n"
" if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
" if (count == 0) return -1;\n"
" for (int device = 0; device < count; ++device) {\n"
" cudaDeviceProp prop;\n"
" if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
" std::printf(\"%d.%d \", prop.major, prop.minor);\n"
" }\n"
" return 0;\n"
"}\n")
execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}"
"--run" "${cufile}"
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(nvcc_res EQUAL 0)
# only keep the last line of nvcc_out
STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
list(GET nvcc_out -1 nvcc_out)
string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architectures from detect_installed_gpus tool" FORCE)
endif()
endif()
if(NOT CUDA_gpu_detect_output)
message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
else()
set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
endif()
endfunction()
########################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
# Usage:
# select_nvcc_arch_flags(out_variable)
function(select_nvcc_arch_flags out_variable)
# List of arch names
set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual")
set(archs_name_default "All")
if(NOT CMAKE_CROSSCOMPILING)
list(APPEND archs_names "Auto")
endif()
# set CUDA_ARCH_NAME strings (so it will be seen as a drop-down list in cmake-gui)
set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.")
set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} )
mark_as_advanced(CUDA_ARCH_NAME)
# verify CUDA_ARCH_NAME value
if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
string(REPLACE ";" ", " archs_names "${archs_names}")
message(FATAL_ERROR "Only ${archs_names} architeture names are supported.")
endif()
if(${CUDA_ARCH_NAME} STREQUAL "Manual")
set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
else()
unset(CUDA_ARCH_BIN CACHE)
unset(CUDA_ARCH_PTX CACHE)
endif()
if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
set(cuda_arch_bin "30 35")
elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
set(cuda_arch_bin "50")
elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
set(cuda_arch_bin "60 61")
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
set(cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
detect_installed_gpus(cuda_arch_bin)
else() # (${CUDA_ARCH_NAME} STREQUAL "Manual")
set(cuda_arch_bin ${CUDA_ARCH_BIN})
endif()
# remove dots and convert to lists
string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}")
list(REMOVE_DUPLICATES cuda_arch_bin)
list(REMOVE_DUPLICATES cuda_arch_ptx)
set(nvcc_flags "")
set(nvcc_archs_readable "")
# Tell NVCC to add binaries for the specified GPUs
foreach(arch ${cuda_arch_bin})
if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
# User explicitly specified PTX for the concrete BIN
list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
else()
# User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
list(APPEND nvcc_archs_readable sm_${arch})
endif()
endforeach()
# Tell NVCC to add PTX intermediate code for the specified architectures
foreach(arch ${cuda_arch_ptx})
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
list(APPEND nvcc_archs_readable compute_${arch})
endforeach()
string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
set(${out_variable} ${nvcc_flags} PARENT_SCOPE)
set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
endfunction()
message(STATUS "CUDA detected: " ${CUDA_VERSION})
if (${CUDA_VERSION} LESS 7.0)
set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
# CUDA 8 may complain that sm_20 is no longer supported. Suppress the
# warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
endif()
include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif(NOT WITH_DSO)
# setting nvcc arch flags
select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
# Set C++11 support
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
# Set :expt-relaxed-constexpr to suppress Eigen warnings
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
......@@ -40,10 +40,9 @@ INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
SET(MKLDNN_DEPENDS ${MKLML_PROJECT})
SET(MKLDNN_MKLROOT ${MKLML_ROOT})
SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB})
SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR})
MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
ELSE()
MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
ENDIF()
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
......@@ -57,15 +56,16 @@ ExternalProject_Add(
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT}
CMAKE_ARGS -DMKLROOT=${MKLML_ROOT}
CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-DMKLROOT:PATH=${MKLDNN_MKLROOT}
-DMKLROOT:PATH=${MKLML_ROOT}
)
ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}")
MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
add_definitions(-DPADDLE_USE_MKLDNN)
LIST(APPEND external_project_dependencies mkldnn)
......@@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND})
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
ENDIF()
ELSEIF(IOS)
# FIXME(liuyiqun): support multiple architectures
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
ELSE()
MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
"You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
ENDIF()
ELSEIF(RPI)
# use hardfp
......
......@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
IF(MOBILE_INFERENCE)
return()
ENDIF()
INCLUDE(ExternalProject)
SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
......
......@@ -149,58 +149,3 @@ endforeach()
foreach(flag ${GPU_COMMON_FLAGS})
safe_set_nvflag(${flag})
endforeach()
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
function(specify_cuda_arch cuda_version cuda_arch)
if(${cuda_version} VERSION_GREATER "8.0")
foreach(capability 61 62)
if(${cuda_arch} STREQUAL ${capability})
list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endif()
endforeach()
elseif(${cuda_version} VERSION_GREATER "7.0" AND ${cuda_arch} STREQUAL "53")
list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endif()
endfunction()
# Common gpu architectures: Kepler, Maxwell
foreach(capability 30 35 50)
list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
endforeach()
if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
endif()
# Modern gpu architectures: Pascal
if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
endif()
# Custom gpu architecture
set(CUDA_ARCH)
if(CUDA_ARCH)
specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH})
endif()
set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
......@@ -115,8 +115,8 @@ function(link_paddle_exe TARGET_NAME)
target_link_libraries(${TARGET_NAME} log)
endif(ANDROID)
if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR)
target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB)
target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
endif()
add_dependencies(${TARGET_NAME} ${external_project_dependencies})
......
......@@ -335,6 +335,16 @@ bilinear_interp
.. autoclass:: paddle.v2.layer.bilinear_interp
:noindex:
dot_prod
---------
.. autoclass:: paddle.v2.layer.dot_prod
:noindex:
out_prod
--------
.. autoclass:: paddle.v2.layer.out_prod
:noindex:
power
-----
.. autoclass:: paddle.v2.layer.power
......
......@@ -36,13 +36,13 @@ Figure 1. PaddlePaddle on IA.
We roughly divide the integration plan into the following aspects.
### CMake
In `CMakeLists.txt`, we add a `WITH_MKLDNN` option; when it is set to `ON`, building the MKL-DNN functionality is enabled, and OpenMP is turned on automatically to improve MKL-DNN performance.
In `CMakeLists.txt`, we give users a `WITH_MKL` switch; it is the master switch for both `WITH_MKLML` and `WITH_MKLDNN`.
At the same time, we introduce a `WITH_MKLML` option for choosing whether to use the MKLML package shipped with MKL-DNN. This package can be used independently of MKL-DNN, but we recommend turning on MKLML together with MKL-DNN to get the best performance.
When `WITH_MKL` is turned on, MKLML is enabled as PaddlePaddle's CBLAS and LAPACK library, and Intel OpenMP is enabled to improve MKLML performance. If the system supports the AVX2 instruction set or above, MKL-DNN is enabled as well.
Therefore, we add `mkldnn.cmake` and `mklml.cmake` files under the `cmake/external` directory; they download the corresponding packages when PaddlePaddle is built and place them in PaddlePaddle's third party directory.
When `WITH_MKL` is turned off, both MKLML and MKL-DNN are disabled.
**Note**: when `WITH_MKLML=ON`, this package is preferred as PaddlePaddle's CBLAS and LAPACK library, so the logic in `cmake/cblas.cmake` is slightly adjusted.
Therefore, we add `mkldnn.cmake` and `mklml.cmake` files under the `cmake/external` directory; they download the corresponding packages when PaddlePaddle is built and place them in PaddlePaddle's third party directory.
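A minimal configure sketch of these switches (only `WITH_MKL` itself comes from this design; any other flags are illustrative):

```bash
# Master switch on: MKLML becomes the CBLAS/LAPACK provider,
# and MKL-DNN is enabled automatically when AVX2 is available.
cmake .. -DWITH_MKL=ON

# Master switch off: both MKLML and MKL-DNN are disabled.
cmake .. -DWITH_MKL=OFF
```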
### Layers
All MKL-DNN-related C++ layers are stored, following PaddlePaddle's directory structure, under
......
......@@ -34,7 +34,7 @@ There are two ways to build the PaddlePaddle documentation.
cd TO_YOUR_PADDLE_CLONE_PATH
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
make gen_proto_py
make paddle_docs paddle_docs_cn
......
# Building the PaddlePaddle library for the Android platform
# Android platform compilation guide
Users can cross-compile a PaddlePaddle library suitable for the Android platform in either of the following two ways:
- Docker-container-based builds
......
# Building the PaddlePaddle library for the iOS platform
# iOS platform compilation guide
Cross-compiling a PaddlePaddle library for the iOS platform must be done on macOS. This document describes how to cross-compile a PaddlePaddle library suitable for iOS from source on macOS.
## Preparing the cross-compilation environment
......@@ -25,7 +25,7 @@ Optional configuration parameters for the iOS platform:
- `IOS_PLATFORM`, can be set to `OS/SIMULATOR`; the default is `OS`.
- `OS`: the build targets are physical devices with the `arm` architecture, such as iPhone and iPad.
- `SIMULATOR`: the build target is the simulator platform with the `x86` architecture.
- `IOS_ARCH`, the target architecture. The target architectures that can be set for each `IOS_PLATFORM` are listed in the table below:
- `IOS_ARCH`, the target architecture. The target architectures that can be set for each `IOS_PLATFORM` are listed in the table below; by default all architectures are built:
<table class="docutils">
<colgroup>
......@@ -41,11 +41,11 @@ Optional configuration parameters for the iOS platform:
<tbody valign="top">
<tr class="row-even">
<td>OS</td>
<td>armv7, armv7s, arm64 (default)</td>
<td>armv7, armv7s, arm64 </td>
</tr>
<tr class="row-odd">
<td>SIMULATOR</td>
<td>i386, x86_64 (default)</td>
<td>i386, x86_64 </td>
</tr>
</tbody>
</table>
......@@ -66,7 +66,7 @@ Optional configuration parameters for the iOS platform:
```bash
cmake -DCMAKE_SYSTEM_NAME=iOS \
-DIOS_PLATFORM=OS \
-DIOS_ARCH="arm64" \
-DIOS_ARCH="armv7;arm64" \
-DIOS_ENABLE_BITCODE=ON \
-DIOS_USE_VECLIB_FOR_BLAS=ON \
-DCMAKE_INSTALL_PREFIX=your/path/to/install \
......@@ -112,6 +112,6 @@ $ make install
- the `lib` directory, which contains the PaddlePaddle C-API static libraries
- the `third_party` directory, which contains all third-party libraries that are depended on
Note: it is recommended to install PaddlePaddle libraries for different architectures into different directories, and then use the `lipo` tool to merge the static libraries into a single fat library supporting multiple architectures.
Note: if the PaddlePaddle library needs to support both physical devices and the simulator, the device and simulator versions must be built separately and then merged into a fat library with the `lipo` tool.
At this point the PaddlePaddle library is fully installed. Users can use the merged fat library in deep-learning iOS apps; see the C-API documentation for how to call it.
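A sketch of the `lipo` merge described above, assuming the device and simulator builds were installed under two illustrative prefixes (the paths and library name are illustrative; `lipo -create`/`-info` are standard macOS tool options):

```bash
# Merge the separately built device and simulator static libraries
# into one fat library, then verify the contained architectures.
lipo -create ios_os/lib/libpaddle_capi_whole.a \
     ios_simulator/lib/libpaddle_capi_whole.a \
     -output libpaddle_capi_whole_fat.a
lipo -info libpaddle_capi_whole_fat.a
```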
# Building the PaddlePaddle library for the Raspberry Pi platform
# Raspberry Pi platform compilation guide
There are generally two ways to build a Raspberry Pi based version:
......
......@@ -25,7 +25,9 @@ limitations under the License. */
#include "hl_matrix.h"
#include "hl_sequence.h"
#include "hl_sparse.h"
#ifndef PADDLE_MOBILE_INFERENCE
#include "hl_warpctc_wrap.h"
#endif
#ifdef HPPL_STUB_FUNC
#include "stub/hl_aggregate_stub.h"
......
......@@ -73,7 +73,6 @@ if(MOBILE_INFERENCE)
list(REMOVE_ITEM GSERVER_SOURCES
dataproviders/DataProvider.cpp
dataproviders/MultiDataProvider.cpp
dataproviders/ProtoDataProvider.cpp
dataproviders/PyDataProvider2.cpp
dataproviders/PyDataProvider.cpp)
......
......@@ -16,8 +16,8 @@ limitations under the License. */
#include <unistd.h>
#include <algorithm>
#include "ProtoDataProvider.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
......@@ -164,8 +164,6 @@ DataProvider* DataProvider::create(const DataConfig& config,
REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
REGISTER_DATA_PROVIDER(dummy, DummyDataProvider);
REGISTER_DATA_PROVIDER(proto, ProtoDataProvider);
REGISTER_DATA_PROVIDER(proto_sequence, ProtoSequenceDataProvider);
int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "ProtoDataProvider.h"
#include <algorithm>
#include <fstream>
#include <istream>
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
#include "DataProviderGroup.h"
#include "paddle/utils/Logging.h"
DEFINE_double(memory_threshold_on_load_data,
1.0,
"stop loading data when memory is not sufficient");
namespace paddle {
REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup<ProtoDataProvider>);
REGISTER_DATA_PROVIDER(proto_sequence_group,
DataProviderGroup<ProtoSequenceDataProvider>);
ProtoDataProvider::ProtoDataProvider(const DataConfig& config,
bool useGpu,
bool loadDataAll)
: DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) {
if (loadDataAll) {
loadData(config_.files());
}
}
void ProtoDataProvider::loadData(const std::vector<std::string>& fileList) {
for (auto& file : fileList) {
if (FLAGS_memory_threshold_on_load_data < 1.0) {
double memUsage = getMemoryUsage();
if (memUsage > FLAGS_memory_threshold_on_load_data) {
LOG(INFO) << "memUsage is " << memUsage << ", > "
<< FLAGS_memory_threshold_on_load_data
<< " therefore SKIP ALL REMAINING file.";
break;
}
}
LOG(INFO) << "load data file " << file;
loadDataFile(file);
}
if (sequenceStartPositions_.size() == sampleNums_) {
// This means that each sample is one sequence
shuffledSequenceIds_.swap(sequenceStartPositions_);
} else {
sequenceStartPositions_.push_back(sampleNums_);
shuffledSequenceIds_.reserve(sequenceStartPositions_.size() - 1);
for (size_t i = 0; i < sequenceStartPositions_.size() - 1; ++i) {
shuffledSequenceIds_.push_back(i);
}
}
LOG(INFO) << "read done, num of instance=" << sampleNums_;
showDataStats();
}
void ProtoDataProvider::loadData(const std::string& fileName) {
std::vector<std::string> fileList;
loadFileList(fileName, fileList);
loadData(fileList);
}
void ProtoDataProvider::checkDataHeader(const DataHeader& header) {
if (header_.slot_defs_size()) {
// header_ is already set. Need to check consistency.
CHECK_EQ(header_.slot_defs_size(), header.slot_defs_size())
<< "Different header";
for (int i = 0; i < header.slot_defs_size(); ++i) {
CHECK_EQ(header_.slot_defs(i).type(), header.slot_defs(i).type());
CHECK_EQ(header_.slot_defs(i).dim(), header.slot_defs(i).dim());
}
return;
}
// header_ is not set before
CHECK(header.slot_defs_size()) << "Invalid header: no slot is defined";
int i;
for (i = 0; i < header.slot_defs_size(); ++i) {
if (header.slot_defs(i).type() == SlotDef::INDEX ||
header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX) {
break;
}
constexpr int kBufLen = 100;
char buf[kBufLen];
snprintf(buf, kBufLen, "slot%d_nnz", i);
nnzStats_.push_back(getStat(buf));
}
numVecSlots_ = i;
// Check that INDEX slots are after VECTOR slots
for (int i = numVecSlots_; i < header.slot_defs_size(); ++i) {
CHECK(header.slot_defs(i).type() == SlotDef::INDEX ||
header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX);
}
slots_.clear();
slots_.reserve(header.slot_defs_size());
for (int i = 0; i < header.slot_defs_size(); ++i) {
slots_.emplace_back();
slots_.back().type = header.slot_defs(i).type();
slots_.back().dim = header.slot_defs(i).dim();
if (SlotDef::VECTOR_SPARSE_NON_VALUE == header.slot_defs(i).type() ||
SlotDef::VECTOR_SPARSE_VALUE == header.slot_defs(i).type()) {
slots_.back().indices.push_back(0);
}
}
header_ = header;
}
void ProtoDataProvider::checkSample(const DataSample& sample) {
CHECK_EQ(numVecSlots_, sample.vector_slots_size());
CHECK(header_.slot_defs_size() == numVecSlots_ + sample.id_slots_size() ||
header_.slot_defs_size() == numVecSlots_ + sample.var_id_slots_size());
for (int i = 0; i < numVecSlots_; ++i) {
uint32_t dim = header_.slot_defs(i).dim();
switch (header_.slot_defs(i).type()) {
case SlotDef::VECTOR_DENSE: {
CHECK_EQ(static_cast<int>(dim), sample.vector_slots(i).values_size());
CHECK_EQ(0, sample.vector_slots(i).ids_size());
break;
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
if (0 == sample.vector_slots(i).ids_size()) {
break;
}
CHECK_LT(0, sample.vector_slots(i).ids_size());
CHECK_EQ(0, sample.vector_slots(i).values_size());
auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(),
sample.vector_slots(i).ids().end());
CHECK_GT(dim, maxId);
break;
}
case SlotDef::VECTOR_SPARSE_VALUE: {
if (0 == sample.vector_slots(i).ids_size()) {
CHECK_EQ(0, sample.vector_slots(i).values_size());
break;
}
CHECK_LT(0, sample.vector_slots(i).values_size());
CHECK_GE(static_cast<int>(dim), sample.vector_slots(i).values_size());
CHECK_EQ(sample.vector_slots(i).values_size(),
sample.vector_slots(i).ids_size());
auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(),
sample.vector_slots(i).ids().end());
CHECK_GT(dim, maxId);
break;
}
case SlotDef::VAR_MDIM_DENSE: {
if (static_cast<int>(dim) != 0) {
CHECK_EQ(static_cast<int>(dim), sample.vector_slots(i).values_size());
if (sample.vector_slots(i).dims_size() != 0) {
int totalDim = sample.vector_slots(i).dims(0);
for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) {
totalDim *= sample.vector_slots(i).dims(j);
}
CHECK_EQ(static_cast<int>(dim), totalDim);
}
} else {
CHECK_NE(sample.vector_slots(i).dims_size(), 0);
int totalDim = sample.vector_slots(i).dims(0);
for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) {
totalDim *= sample.vector_slots(i).dims(j);
}
CHECK_EQ(totalDim, sample.vector_slots(i).values_size());
}
break;
}
case SlotDef::STRING: {
CHECK_EQ(static_cast<int>(1), sample.vector_slots(i).strs_size());
CHECK_EQ(0, sample.vector_slots(i).ids_size());
CHECK_EQ(0, sample.vector_slots(i).values_size());
break;
}
default:
LOG(FATAL) << "BUG: Should not reach here";
}
}
for (int i = numVecSlots_; i < header_.slot_defs_size(); ++i) {
if (header_.slot_defs(i).type() != SlotDef::VAR_MDIM_INDEX) {
uint32_t id = sample.id_slots(i - numVecSlots_);
if (id == -1U) continue;
CHECK_LT(id, header_.slot_defs(i).dim());
} else {
for (int j = 0; j < sample.var_id_slots(i - numVecSlots_).ids_size();
++j) {
uint32_t id = sample.var_id_slots(i - numVecSlots_).ids(j);
CHECK_LT(id, header_.slot_defs(i).dim());
}
}
}
}
void ProtoDataProvider::loadDataFile(const std::string& fileName) {
std::ifstream is(fileName);
CHECK(is) << "Fail to open " << fileName;
bool dataCompression = str::endsWith(fileName, ".gz");
std::unique_ptr<ProtoReader> reader(new ProtoReader(&is, dataCompression));
CHECK(reader) << "Fail to create proto data input stream";
DataHeader header;
CHECK(reader->read(&header));
checkDataHeader(header);
DataSample sample;
do {
if (!reader->read(&sample)) {
break;
}
checkSample(sample);
if (sample.is_beginning()) {
sequenceStartPositions_.push_back(sampleNums_);
}
fillSlots(sample);
++sampleNums_;
} while (true);
CHECK(is.eof()) << "Fail to read file";
reader.reset(nullptr);
is.close();
}
// checkSample has been done before, so no check here
void ProtoDataProvider::fillSlots(const DataSample& sample) {
for (size_t i = 0; i < slots_.size(); ++i) {
auto& slot = slots_[i];
int dim = slot.dim;
switch (slot.type) {
case SlotDef::VECTOR_DENSE: {
size_t oldSize = slot.denseData.size();
slot.denseData.resize(oldSize + dim);
const float* values = sample.vector_slots(i).values().data();
#ifdef PADDLE_TYPE_DOUBLE
std::copy(values, values + dim, slot.denseData.begin() + oldSize);
#else
memcpy(slot.denseData.data() + oldSize, values, sizeof(real) * dim);
#endif
break;
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
int slotSize = sample.vector_slots(i).ids_size();
int subSlotSize = 0;
int id = 0; // the slot id
// find whether this vector_slots has a subseq; if it does not,
// subSlotSize stays 0.
for (id = 0; id < sample.subseq_slots_size(); id++) {
if (sample.subseq_slots(id).slot_id() == i) {
subSlotSize = sample.subseq_slots(id).lens_size();
break;
}
}
if (subSlotSize && slot.subIndices.size() == 0UL) {
// If there is a subSeq, the first element of subIndices is 0.
slot.subIndices.push_back(0);
}
if (slotSize == 0UL) {
// if there are no ids, new indices = old indices.
slot.indices.push_back(slot.indices.back());
// if there is a subSeq, new subIndices = old subIndices.
if (slot.subIndices.size()) {
slot.subIndices.push_back(slot.subIndices.back());
}
break;
}
slot.sparseNonValueData.resize(slot.indices.back() + slotSize);
const unsigned int* ids = sample.vector_slots(i).ids().data();
memcpy(slot.sparseNonValueData.data() + slot.indices.back(),
ids,
sizeof(*ids) * slotSize);
slot.indices.push_back(slot.indices.back() + slotSize);
if (subSlotSize) {
for (int ii = 0; ii < subSlotSize; ++ii) {
slot.subIndices.push_back(slot.subIndices.back() +
sample.subseq_slots(id).lens(ii));
}
}
break;
}
case SlotDef::VECTOR_SPARSE_VALUE: {
if (0 == sample.vector_slots(i).ids_size()) {
slot.indices.push_back(slot.indices.back());
break;
}
int slotSize = sample.vector_slots(i).ids_size();
slot.sparseFloatValueData.resize(slot.indices.back() + slotSize);
const unsigned int* ids = sample.vector_slots(i).ids().data();
const float* values = sample.vector_slots(i).values().data();
for (int ii = 0; ii < slotSize; ++ii) {
slot.sparseFloatValueData[slot.indices.back() + ii].col = ids[ii];
slot.sparseFloatValueData[slot.indices.back() + ii].value =
values[ii];
}
slot.indices.push_back(slot.indices.back() + slotSize);
break;
}
case SlotDef::INDEX: {
slot.indexData.push_back(sample.id_slots(i - numVecSlots_));
break;
}
case SlotDef::VAR_MDIM_DENSE: {
size_t oldSize = slot.varDenseData.size();
slot.varDenseData.resize(oldSize + 1);
size_t varDim = sample.vector_slots(i).values_size();
slot.varDenseData[oldSize].data.resize(varDim);
const float* values = sample.vector_slots(i).values().data();
#ifdef PADDLE_TYPE_DOUBLE
std::copy(
values, values + varDim, slot.varDenseData[oldSize].data.data());
#else
memcpy(slot.varDenseData[oldSize].data.data(),
values,
sizeof(real) * varDim);
#endif
slot.varDenseData[oldSize].dims.resize(
sample.vector_slots(i).dims_size());
memcpy(slot.varDenseData[oldSize].dims.data(),
sample.vector_slots(i).dims().data(),
sizeof(uint32_t) * sample.vector_slots(i).dims_size());
break;
}
case SlotDef::VAR_MDIM_INDEX: {
size_t oldSize = slot.varIndices.size();
slot.varIndices.resize(oldSize + 1);
size_t varDim = sample.var_id_slots(i - numVecSlots_).ids_size();
slot.varIndices[oldSize].resize(varDim);
memcpy(slot.varIndices[oldSize].data(),
sample.var_id_slots(i - numVecSlots_).ids().data(),
sizeof(uint32_t) * varDim);
break;
}
case SlotDef::STRING: {
slot.strData.push_back(sample.vector_slots(i).strs(0));
break;
}
}
}
}
void ProtoDataProvider::showDataStats() {
std::ostringstream oss;
for (size_t i = 0; i < slots_.size(); ++i) {
auto& slot = slots_[i];
if (slot.type == SlotDef::VECTOR_SPARSE_NON_VALUE) {
size_t nnz = slot.sparseNonValueData.size();
oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; ";
} else if (slot.type == SlotDef::VECTOR_SPARSE_VALUE) {
size_t nnz = slot.sparseFloatValueData.size();
oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; ";
}
}
LOG(INFO) << oss.str();
}
void ProtoDataProvider::reset() {
currentSequenceIndex_ = 0;
if (!skipShuffle_) {
shuffle();
}
DataProvider::reset();
}
void ProtoDataProvider::shuffle() {
std::shuffle(shuffledSequenceIds_.begin(),
shuffledSequenceIds_.end(),
ThreadLocalRandomEngine::get());
}
/*
Loop through sequences starting from currentSequenceIndex_
for at most size samples. For each sequence ranging from [begin, end),
op(begin, end) will be called.
return the number of sequences scanned
*/
template <class Op>
int64_t ProtoDataProvider::sequenceLoop(Op op, int64_t size) {
int64_t sz = 0;
size_t i;
size_t sequenceCount = shuffledSequenceIds_.size();
if (usageRatio_ < 1.0f) {
sequenceCount = static_cast<int64_t>(sequenceCount * usageRatio_);
}
for (i = currentSequenceIndex_; i < sequenceCount; ++i) {
size_t id = shuffledSequenceIds_[i];
int64_t begin = sequenceStartPositions_[id];
int64_t end = sequenceStartPositions_[id + 1];
int64_t len = end - begin;
if (sz + len > size && sz > 0) break;
sz += len;
op(begin, end);
}
return i - currentSequenceIndex_;
}
/*
Loop through sequences starting from currentSequenceIndex_
for at most size samples. For each sample of each sequence at position
pos, op(pos) will be called.
return the number of sequences scanned
*/
template <class Op>
int64_t ProtoDataProvider::sampleLoop(Op op, int64_t size) {
if (iidData()) {
size = std::min<int64_t>(sampleNums_ - currentSequenceIndex_, size);
for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size;
++i) {
size_t pos = shuffledSequenceIds_[i];
op(pos);
}
return size;
} else {
auto f = [op](int64_t begin, int64_t end) {
for (int64_t pos = begin; pos < end; ++pos) {
op(pos);
}
};
return sequenceLoop(f, size);
}
}
/*
Loop through sub-sequences starting from currentSequenceIndex_
for at most size samples. For each sample of each sub-sequence at position
pos, op(pos) will be called.
return the number of sub-sequences scanned
*/
template <class Op>
int64_t ProtoDataProvider::subSampleLoop(Op op, int64_t size, int slot) {
CHECK(iidData()) << "subSampleLoop only accepts iid data";
size = std::min<int64_t>(sampleNums_ - currentSequenceIndex_, size);
int subSize = 0;
for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size;
++i) {
size_t pos = shuffledSequenceIds_[i];
int64_t* indexs = slots_[slot].indices.data();
int64_t* subIndexs = slots_[slot].subIndices.data();
int64_t subSeqStart = 0;
int64_t subSeqEnd = 0;
for (int j = 0; j < (int)slots_[slot].subIndices.size(); j++) {
if (subIndexs[j] == indexs[pos]) {
subSeqStart = j;
if (subIndexs[pos] == subIndexs[pos + 1]) {
subSeqEnd = j + 1;
break;
}
} else if (subIndexs[j] == indexs[pos + 1]) {
subSeqEnd = j;
break;
}
}
for (int j = subSeqStart; j < subSeqEnd; j++) {
op(j);
}
subSize += subSeqEnd - subSeqStart;
}
return subSize;
}
int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
DataBatch* batch) {
int64_t numSequences = 0; // actual number of sequences in the batch
// the number of sequences scanned, including those skipped because too long
int64_t numScannedSeqs = 0;
std::lock_guard<RWLock> guard(lock_);
if (iidData()) {
size = std::min<int64_t>(getSize() - currentSequenceIndex_, size);
numScannedSeqs = numSequences = size;
} else {
int64_t sz = 0;
auto op = [&sz, &numSequences](int64_t begin, int64_t end) {
++numSequences;
sz += end - begin;
};
numScannedSeqs = sequenceLoop(op, size);
VLOG_IF(1, numScannedSeqs > numSequences)
<< numScannedSeqs - numSequences
<< " sequences are skipped because longer than " << size;
size = sz;
}
if (size <= 0) return 0;
DataBatch& cpuBatch = *cpuBatch_;
std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
cpuBatch.setSize(size);
cpuArguments.resize(header_.slot_defs_size());
if (!iidData()) {
ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions,
numSequences + 1,
/* useGpu= */ false);
int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false);
int pos = 0;
int i = 0;
auto op = [buf, &pos, &i](int64_t begin, int64_t end) {
buf[i] = pos;
pos += end - begin;
++i;
};
sequenceLoop(op, size);
buf[i] = size;
for (size_t slot = 1; slot < cpuArguments.size(); ++slot) {
cpuArguments[slot].sequenceStartPositions =
cpuArguments[0].sequenceStartPositions;
}
}
for (int slot = 0; slot < header_.slot_defs_size(); ++slot) {
size_t dim = header_.slot_defs(slot).dim();
SlotDef::SlotType slotType = header_.slot_defs(slot).type();
std::vector<int64_t> dataPos;
dataPos.reserve(size);
auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); };
sampleLoop(op, size);
switch (slotType) {
case SlotDef::VECTOR_DENSE: {
Matrix::resizeOrCreate(cpuArguments[slot].value,
size,
dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
for (int i = 0; i < size; ++i) {
memcpy(buf + i * dim,
slots_[slot].denseData.data() + dataPos[i] * dim,
sizeof(real) * dim);
}
break;
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
if (!(cpuArguments[slot].value)) {
cpuArguments[slot].value =
Matrix::createSparseMatrix(size,
dim,
size /*DEFAULT_AVG_WIDTH = 1*/,
NO_VALUE,
SPARSE_CSR,
false,
useGpu_);
}
auto mat = cpuArguments[slot].value;
mat->resize(size, dim);
if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
dataPos.data(),
slots_[slot].indices.data(),
slots_[slot].sparseNonValueData.data(),
HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
dataPos.data(),
slots_[slot].indices.data(),
slots_[slot].sparseNonValueData.data());
} else {
LOG(FATAL) << "Not Supported";
}
size_t numElements = 0;
for (auto pos : dataPos) {
numElements +=
slots_[slot].indices[pos + 1] - slots_[slot].indices[pos];
}
nnzStats_[slot]->addSample(numElements);
break;
}
case SlotDef::VECTOR_SPARSE_VALUE: {
if (!(cpuArguments[slot].value)) {
cpuArguments[slot].value =
Matrix::createSparseMatrix(size,
dim,
size /*DEFAULT_AVG_WIDTH = 1*/,
FLOAT_VALUE,
SPARSE_CSR,
false,
useGpu_);
}
auto mat = cpuArguments[slot].value;
mat->resize(size, dim);
if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
dataPos.data(),
slots_[slot].indices.data(),
slots_[slot].sparseFloatValueData.data(),
HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
dataPos.data(),
slots_[slot].indices.data(),
slots_[slot].sparseFloatValueData.data());
} else {
LOG(FATAL) << "Not Supported";
}
break;
}
case SlotDef::INDEX: {
IVector::resizeOrCreate(cpuArguments[slot].ids,
size,
/* useGpu= */ false);
int* buf = cpuArguments[slot].ids->getData();
for (int i = 0; i < size; ++i) {
buf[i] = slots_[slot].indexData[dataPos[i]];
}
break;
}
case SlotDef::VAR_MDIM_DENSE: {
CHECK_EQ(size, 1);
auto mat = cpuArguments[slot].value;
size_t totalDim = slots_[slot].varDenseData[dataPos[0]].data.size();
CHECK_EQ(slots_[slot].varDenseData[dataPos[0]].dims.size(), size_t(3));
size_t height, width, depth, oldWidth;
/* dims[2] is depth, will be changed to dims[0] in future */
depth = slots_[slot].varDenseData[dataPos[0]].dims[2];
height = slots_[slot].varDenseData[dataPos[0]].dims[1];
width = slots_[slot].varDenseData[dataPos[0]].dims[0];
oldWidth = width;
/* process the undesirable sample */
if (oldWidth < height) {
width = height;
}
cpuArguments[slot].setFrameHeight(height);
cpuArguments[slot].setFrameWidth(width);
if (oldWidth < height) {
totalDim = width * height * depth;
}
Matrix::resizeOrCreate(cpuArguments[slot].value,
size,
totalDim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
cpuArguments[slot].value->zeroMem();
if (oldWidth < height) {
real* srcBuf = slots_[slot].varDenseData[dataPos[0]].data.data();
for (size_t i = 0; i < depth; i++) {
for (size_t j = 0; j < height; j++) {
for (size_t k = 0; k < oldWidth; k++) {
buf[i * height * width + j * width + k] =
srcBuf[i * height * oldWidth + j * oldWidth + k];
}
}
}
} else {
memcpy(buf,
slots_[slot].varDenseData[dataPos[0]].data.data(),
sizeof(real) * totalDim);
}
ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
size + 1, /* size == 1 currently */
/* useGpu= */ false);
int* bufStarts =
cpuArguments[slot].sequenceStartPositions->getMutableData(false);
bufStarts[0] = 0;
bufStarts[1] = 1;
break;
}
case SlotDef::VAR_MDIM_INDEX: {
CHECK_EQ(size, 1);
size_t totalDim = slots_[slot].varIndices[dataPos[0]].size();
IVector::resizeOrCreate(cpuArguments[slot].ids,
totalDim,
/* useGpu= */ false);
int* buf = cpuArguments[slot].ids->getData();
memcpy(buf,
slots_[slot].varIndices[dataPos[0]].data(),
sizeof(int) * totalDim);
ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
size + 1, /* size == 1 currently */
/* useGpu= */ false);
int* bufStarts =
cpuArguments[slot].sequenceStartPositions->getMutableData(false);
bufStarts[0] = 0;
/* we expand the convolutional feature map to a sequence data,
* so there should be corresponding sequence labels */
bufStarts[1] = totalDim;
break;
}
case SlotDef::STRING: {
if (cpuArguments[slot].strs) {
cpuArguments[slot].strs->resize(size);
} else {
cpuArguments[slot].strs =
std::make_shared<std::vector<std::string>>(size);
}
for (int i = 0; i < size; ++i) {
(*cpuArguments[slot].strs)[i] = slots_[slot].strData[dataPos[i]];
}
break;
}
}
}
if (useGpu_) {
std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
DataBatch& gpuBatch = *gpuBatch_;
std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
gpuArguments.resize(cpuArguments.size());
gpuBatch.setSize(size);
for (int i = 0; i < header_.slot_defs_size(); ++i) {
SlotDef::SlotType slotType = header_.slot_defs(i).type();
if (SlotDef::VECTOR_SPARSE_VALUE == slotType ||
SlotDef::VECTOR_SPARSE_NON_VALUE == slotType) {
gpuArguments[i] = cpuArguments[i];
gpuArguments[i].sequenceStartPositions =
cpuArguments[i].sequenceStartPositions;
} else {
gpuArguments[i].resizeAndCopyFrom(
cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
}
hl_stream_synchronize(HPPL_STREAM_1);
*batch = gpuBatch;
} else {
*batch = cpuBatch;
}
currentSequenceIndex_ += numScannedSeqs;
return batch->getSize();
}
ProtoSequenceDataProvider::ProtoSequenceDataProvider(const DataConfig& config,
bool useGpu,
bool loadDataAll)
: ProtoDataProvider(config, useGpu, loadDataAll) {}
int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
DataBatch* batch) {
CHECK(iidData()) << "ProtoSequenceDataProvider only accepts iid data";
int64_t numSequences = 0; // actual number of sequences in the batch
// the number of sequences scanned, including those skipped because too long
int64_t numScannedSeqs = 0;
std::lock_guard<RWLock> guard(lock_);
size = std::min<int64_t>(getSize() - currentSequenceIndex_, size);
numScannedSeqs = numSequences = size;
if (size <= 0) return 0;
DataBatch& cpuBatch = *cpuBatch_;
std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
cpuBatch.setSize(size);
cpuArguments.resize(header_.slot_defs_size());
for (int slot = 0; slot < header_.slot_defs_size(); ++slot) {
SlotDef::SlotType slotType = header_.slot_defs(slot).type();
std::vector<int64_t> dataPos;
dataPos.reserve(size);
auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); };
sampleLoop(op, size);
// current slot: sequenceStartPositions
ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
size + 1,
/* useGpu= */ false);
switch (slotType) {
case SlotDef::VECTOR_SPARSE_VALUE:
case SlotDef::VAR_MDIM_DENSE:
case SlotDef::VAR_MDIM_INDEX: {
LOG(FATAL) << "ProtoSequenceDataProvider only support"
<< " VECTOR_DENSE, VECTOR_SPARSE_NON_VALUE and INDEX slots";
break;
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
// copy to IDS, not value
// pointers used in current slot
sparse_non_value_t* data = slots_[slot].sparseNonValueData.data();
int64_t* indexs = slots_[slot].indices.data();
int64_t* seqs = dataPos.data();
// current slot: we need `size` instances; what is the total length?
int totalFeatureInCurrentSlot = 0;
for (int ins = 0; ins < size; ins++) {
int64_t currInsId = seqs[ins];
totalFeatureInCurrentSlot +=
indexs[currInsId + 1] - indexs[currInsId];
// special: if current instance has NO feature in current slot
if (indexs[currInsId + 1] == indexs[currInsId]) {
totalFeatureInCurrentSlot++;
}
}
// done
// current slot: ids
IVector::resizeOrCreate(cpuArguments[slot].ids,
totalFeatureInCurrentSlot,
/* useGpu= */ false);
// where to write
int* currPosOfArgumentId = cpuArguments[slot].ids->getData();
int* currPosOfArgumentSeqStart =
cpuArguments[slot].sequenceStartPositions->getMutableData(false);
int allSequenceLength = 0;
currPosOfArgumentSeqStart[0] = 0;
// for each instance, copy data and fill sequence positions
for (int instance = 0; instance < size; instance++) {
int64_t currInstanceId = seqs[instance];
int64_t currInstanceLength =
indexs[currInstanceId + 1] - indexs[currInstanceId];
sparse_non_value_t* currInstanceData = data + indexs[currInstanceId];
// write sequenceStartPositions
allSequenceLength += currInstanceLength;
currPosOfArgumentSeqStart[instance + 1] = allSequenceLength;
// copy features
for (int featCopier = 0; featCopier < currInstanceLength;
featCopier++) {
currPosOfArgumentId[featCopier] = currInstanceData[featCopier].col;
}
currPosOfArgumentId += currInstanceLength;
// special: if current instance has NO feature in current slot
if (currInstanceLength == 0) {
allSequenceLength++;
currPosOfArgumentSeqStart[instance + 1] = allSequenceLength;
currPosOfArgumentId[0] = -1;
currPosOfArgumentId++;
}
// done
}
if (slots_[slot].subIndices.size()) {
std::vector<int64_t> dataSubPos;
auto op = [this, &dataSubPos](int64_t pos) {
dataSubPos.push_back(pos);
};
int subSize = subSampleLoop(op, size, slot);
ICpuGpuVector::resizeOrCreate(
cpuArguments[slot].subSequenceStartPositions, subSize + 1, false);
int* currPosOfArgumentSubSeqStart =
cpuArguments[slot].subSequenceStartPositions->getMutableData(
false);
int64_t* subSeqs = dataSubPos.data();
int64_t* subIndexs = slots_[slot].subIndices.data();
int allSubSequenceLength = 0;
currPosOfArgumentSubSeqStart[0] = 0;
// for each instance, compute sub-sequence number
for (int instance = 0; instance < subSize; instance++) {
int64_t currSubInstanceId = subSeqs[instance];
int64_t currSubInstanceLength =
subIndexs[currSubInstanceId + 1] - subIndexs[currSubInstanceId];
// write subSequenceStartPositions
allSubSequenceLength += currSubInstanceLength;
currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength;
// special: if current instance has NO feature in current slot
if (currSubInstanceLength == 0) {
allSubSequenceLength++;
currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength;
}
}
cpuArguments[slot].checkSubset();
}
break;
}
case SlotDef::INDEX: {
// label slot
IVector::resizeOrCreate(cpuArguments[slot].ids,
size,
/* useGpu= */ false);
// fill labels
int* buf = cpuArguments[slot].ids->getData();
for (int i = 0; i < size; ++i) {
buf[i] = slots_[slot].indexData[dataPos[i]];
}
// label HAS sequence structure
cpuArguments[slot].sequenceStartPositions->fillSequence(false);
break;
}
case SlotDef::VECTOR_DENSE: {
// copy values
size_t dim = header_.slot_defs(slot).dim();
Matrix::resizeOrCreate(cpuArguments[slot].value,
size,
dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
for (int i = 0; i < size; ++i) {
memcpy(buf + i * dim,
slots_[slot].denseData.data() + dataPos[i] * dim,
sizeof(real) * dim);
}
// sequence structure
cpuArguments[slot].sequenceStartPositions->fillSequence(false);
break;
}
default: { LOG(FATAL) << "should not reach here"; }
}
}
if (useGpu_) {
std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
DataBatch& gpuBatch = *gpuBatch_;
std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
gpuArguments.resize(cpuArguments.size());
gpuBatch.setSize(size);
for (size_t i = 0; i < cpuArguments.size(); ++i) {
gpuArguments[i].resizeAndCopyFrom(
cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
hl_stream_synchronize(HPPL_STREAM_1);
*batch = gpuBatch;
} else {
*batch = cpuBatch;
}
currentSequenceIndex_ += numScannedSeqs;
return batch->getSize();
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "DataFormat.pb.h"
#include "paddle/utils/Stat.h"
#include "DataProvider.h"
#include "ProtoReader.h"
namespace paddle {
/**
* @brief Provides data from a protobuf data file, with each sample
* specified by a proto message.
*
* DataSample defined in DataFormat.proto.
*
* The file format is
*
* header
*
* sample1
*
* sample2
*
* ...
*
* sampleN
*
* @note: In the data file, each message is prefixed with its length.
* The reading/writing of the protobuf messages is implemented in ProtoReader.h.
*/
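// On-disk layout sketch: each message is written with a length prefix, i.e.
// [len][DataHeader] [len][DataSample] ... [len][DataSample]; the exact length
// encoding is an implementation detail of ProtoReader.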
class ProtoDataProvider : public DataProvider {
public:
ProtoDataProvider(const DataConfig& config,
bool useGpu,
bool loadDataAll = true);
virtual void reset();
/**
* @note this size includes the sequences which are skipped because they
* are longer than the batch size.
*/
virtual int64_t getSize() {
int64_t size = sampleNums_;
if (usageRatio_ < 1.0f) {
size = static_cast<int64_t>(size * usageRatio_);
}
return size;
}
virtual void shuffle();
void loadData(const std::vector<std::string>& fileList);
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
protected:
/**
* @brief load protobuf data from a list of files
* @param[in] fileName file name of a file which contains
* a list of file names
*/
void loadData(const std::string& fileName);
/**
* @brief load protobuf data from file
* @param[in] fileName data file name
*/
void loadDataFile(const std::string& fileName);
/** @brief check data header of each data sample
* @param[in] header data header read from protobuf data
*/
void checkDataHeader(const DataHeader& header);
/**
* @brief fill protobuf data into slot_,
* slot_ is a vector of ProtoSlot in memory.
* @param[in] sample data sample read from protobuf data
*/
void fillSlots(const DataSample& sample);
/**
* @brief return true if each sample is one sequence, i.e., independent
* of other samples.
*/
inline bool iidData() const { return sequenceStartPositions_.empty(); }
/**
* @brief check that sample is consistent with header_
*/
void checkSample(const DataSample& sample);
template <class Op>
int64_t sequenceLoop(Op op, int64_t size);
template <class Op>
int64_t sampleLoop(Op op, int64_t size);
template <class Op>
int64_t subSampleLoop(Op op, int64_t size, int slot);
void showDataStats();
protected:
struct ProtoVarSlot {
std::vector<real> data;
std::vector<int> dims;
};
struct ProtoSlot {
SlotDef::SlotType type;
int dim;
std::vector<int> indexData;
std::vector<real> denseData;
std::vector<sparse_non_value_t> sparseNonValueData;
std::vector<sparse_float_value_t> sparseFloatValueData;
std::vector<int64_t> indices;
std::vector<int64_t> subIndices;
std::vector<ProtoVarSlot> varDenseData;
std::vector<std::vector<int>> varIndices;
std::vector<std::string> strData;
};
DataHeader header_;
int numVecSlots_;
std::vector<ProtoSlot> slots_;
size_t sampleNums_;
/**
* The starting position of each sequence in samples.
* The last element should be num of samples.
* If empty, each sample is one sequence.
*/
std::vector<size_t> sequenceStartPositions_;
int64_t currentSequenceIndex_;
// The size should be the number of sequences.
std::vector<size_t> shuffledSequenceIds_;
ThreadLocalD<DataBatch> cpuBatch_;
ThreadLocalD<DataBatch> gpuBatch_;
RWLock lock_;
std::vector<StatPtr> nnzStats_;  // stats for the number of non-zero entries
};
/**
* @brief Specialized use for proto data: instances should contain
* sparse-non-value slots and a label.
*
* @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
*/
class ProtoSequenceDataProvider : public ProtoDataProvider {
public:
ProtoSequenceDataProvider(const DataConfig& config,
bool useGpu,
bool loadDataAll = true);
~ProtoSequenceDataProvider() {}
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
};
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "Layer.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
/**
* @brief A layer for computing the dot product of two vectors.
* Input1: vector (batchSize * dim)
* Input2: vector (batchSize * dim)
* Output: a matrix: (batchSize * 1)
*/
class DotProdLayer : public Layer {
public:
explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
~DotProdLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
};
REGISTER_LAYER(dot_prod, DotProdLayer);
bool DotProdLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
CHECK_EQ(inputLayers_.size(), 2U);
CHECK_EQ(1UL, getSize())
<< "The output dimensionality of this layer should be fixed to 1.";
return true;
}
void DotProdLayer::forward(PassType passType) {
Layer::forward(passType);
MatrixPtr inV0 = getInputValue(0);
MatrixPtr inV1 = getInputValue(1);
size_t batchSize = inV0->getHeight();
CHECK_EQ(inV1->getHeight(), batchSize);
CHECK_EQ(inV0->getWidth(), inV1->getWidth());
{
REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
reserveOutput(batchSize, 1);
}
MatrixPtr outV = getOutputValue();
{
REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
outV->sumOfProducts(*inV0, *inV1, 1, 0);
}
}
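// Gradient sketch: the forward pass computes out[b] = sum_k inV0[b][k] * inV1[b][k],
// so dL/dinV0[b][k] = outG[b] * inV1[b][k] (and symmetrically for inV1);
// addRowScale below applies exactly this per-row scaling.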
void DotProdLayer::backward(const UpdateCallback& callback) {
MatrixPtr inV0 = getInputValue(0);
MatrixPtr inV1 = getInputValue(1);
MatrixPtr outG = getOutputGrad();
MatrixPtr inG0 = getInputGrad(0);
MatrixPtr inG1 = getInputGrad(1);
{
REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
if (inG0) {
inG0->addRowScale(0, *inV1, *outG);
}
if (inG1) {
inG1->addRowScale(0, *inV0, *outG);
}
}
}
} // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLDNNConcatLayer.h"
using namespace mkldnn; // NOLINT
typedef memory::format format;
namespace paddle {
REGISTER_LAYER(mkldnn_concat, MKLDNNConcatLayer);
bool MKLDNNConcatLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
if (!MKLDNNLayer::init(layerMap, parameterMap)) {
return false;
}
CHECK_GT(inputLayers_.size(), 1UL);
CHECK(!biasParameter_);
return true;
}
void MKLDNNConcatLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
reshapeInput(bs, ih, iw);
ic = inputLayers_[0]->getSize() / ih / iw;
CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
CHECK_GT(inputLayers_.size(), 1UL);
channels_.resize(inputLayers_.size());
channels_[0] = ic;
// need change the output channel, so use oc_ instead
// TODO(TJ): change API, use &oc
oc_ = ic;
for (size_t i = 1; i < inputLayers_.size(); i++) {
int batchsize, height, width;
reshapeInput(batchsize, height, width, i);
CHECK_EQ(bs, batchsize);
CHECK_EQ(ih, height);
CHECK_EQ(iw, width);
channels_[i] = inputLayers_[i]->getSize() / height / width;
CHECK_EQ((size_t)channels_[i] * height * width, inputLayers_[i]->getSize());
oc_ += channels_[i];
}
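// e.g., inputs with channels {128, 32, 64} and identical spatial sizes
// concatenate to oc_ = 224.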
oh = ih;
ow = iw;
reshapeOutput(oh, ow);
resizeOutput(bs, oc_ * oh * ow);
}
void MKLDNNConcatLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetFwdBuffers(inVals_, out);
in = inVals_[0];
std::shared_ptr<concat::primitive_desc> fwdPD;
resetFwdPD(fwdPD, inVals_, out);
resetFwdPipeline(pipeline, fwdPD, inVals_, out);
}
void MKLDNNConcatLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetBwdBuffers(inGrads_, out);
in = inGrads_[0];
resetBwdPipeline(pipeline, bwds_, inGrads_, out);
}
void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
inputs.resize(inputLayers_.size());
bool has8c = false, has16c = false, hasnc = false;
for (size_t i = 0; i < inputs.size(); i++) {
// resetInValue uses ic_, so temporarily set it to the current input's channels
// TODO(TJ): change ic_ to a vector so channels_ can be removed
ic_ = channels_[i];
resetInValue(inputs[i], nullptr, i);
CHECK(inputs[i]);
auto dm = inputs[i]->getDims();
// input formats may differ, but their ndims must be equal
CHECK(i == 0 || dm.size() == inputs[0]->getDims().size());
CHECK_EQ(bs_, dm[0]);
CHECK_EQ(channels_[i], dm[1]);
if (dm.size() > 2) {
CHECK_EQ(ih_, dm[2]);
CHECK_EQ(iw_, dm[3]);
}
if (inputs[i]->getFormat() == format::nc) {
hasnc = true;
}
if (inputs[i]->getFormat() == format::nChw8c) {
has8c = true;
}
if (inputs[i]->getFormat() == format::nChw16c) {
has16c = true;
}
}
// restore ic_; it always holds the channel count of input 0
ic_ = channels_[0];
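// Choose the output format: prefer a blocked layout (nChw16c, then nChw8c)
// when some input already uses it and oc_ divides evenly; fall back to
// nc (only valid for 1x1 spatial maps) or plain nchw.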
format outFmt;
if (has16c && oc_ % 16 == 0) {
outFmt = format::nChw16c;
} else if (has8c && oc_ % 8 == 0) {
outFmt = format::nChw8c;
} else if (hasnc) {
CHECK(oh_ == 1 && ow_ == 1);
outFmt = format::nc;
} else {
outFmt = format::nchw;
}
memory::dims outDims =
hasnc ? memory::dims{bs_, oc_} : memory::dims{bs_, oc_, oh_, ow_};
auto outPD = MKLDNNMatrix::createPrimitiveDesc(outDims, outFmt, engine_);
resetOutValue(out, outPD);
}
void MKLDNNConcatLayer::resetFwdPD(std::shared_ptr<concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out) {
std::vector<memory::primitive_desc> srcPDs;
for (size_t i = 0; i < inputs.size(); i++) {
srcPDs.push_back(inputs[i]->getPrimitiveDesc());
}
CHECK(out);
pd.reset(new concat::primitive_desc(out->getMemoryDesc(), axis_, srcPDs));
CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
}
void MKLDNNConcatLayer::resetFwdPipeline(
std::vector<primitive>& pipeline,
std::shared_ptr<concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
std::vector<primitive::at> srcs;
for (size_t i = 0; i < inputs.size(); i++) {
srcs.push_back(*(inputs[i]));
}
fwd_.reset(new concat(*pd, srcs, *out));
pipeline.push_back(*fwd_);
}
void MKLDNNConcatLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
CHECK(outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
CHECK(out);
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
CHECK(inVals_[i]);
// resetInGrad uses inVal_
// TODO(TJ): move inVals_ into MKLDNNLayer and remove inVal_
inVal_ = inVals_[i];
resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc());
}
// restore inVal_; it always holds input 0's value
inVal_ = inVals_[0];
}
void MKLDNNConcatLayer::resetBwdPipeline(
std::vector<mkldnn::primitive>& pipeline,
std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
// reset the backward primitives
memory::dims offsets = {0, 0, 0, 0};
prims.resize(inputs.size());
CHECK_EQ(inputs.size(), channels_.size());
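// Backward concat needs no dedicated primitive: each input grad is filled by
// a reorder from a channel-slice view of the output grad, taken at the
// running offset along axis_.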
for (size_t i = 0; i < inputs.size(); i++) {
auto viewPD = view::primitive_desc(
out->getPrimitiveDesc(), inputs[i]->getDims(), offsets);
auto bwdPD = reorder::primitive_desc(viewPD.dst_primitive_desc(),
inputs[i]->getPrimitiveDesc());
prims[i].reset(new reorder(bwdPD, *out, *(inputs[i])));
offsets[axis_] += channels_[i];
// push to pipeline
pipeline.push_back(*prims[i]);
}
}
} // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "MKLDNNLayer.h"
#include "mkldnn.hpp"
namespace paddle {
/**
* @brief A concatenation layer implemented as a subclass of MKLDNNLayer.
*
* The config file api is mkldnn_concat
*/
class MKLDNNConcatLayer : public MKLDNNLayer {
protected:
std::vector<MKLDNNMatrixPtr> inVals_;
std::vector<MKLDNNMatrixPtr> inGrads_;
std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
// input channel numbers
std::vector<int> channels_;
// concat dimension in MKL-DNN:
// axis_ == 0 concatenates along the batch dimension,
// axis_ == 1 concatenates along the channel dimension (default)
int axis_;
public:
explicit MKLDNNConcatLayer(const LayerConfig& config)
: MKLDNNLayer(config), axis_(1) {}
~MKLDNNConcatLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void printSizeInfo() override {
CHECK_EQ(channels_.size(), inputLayers_.size());
for (size_t i = 0; i < channels_.size(); ++i) {
VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName()
<< ": " << bs_ << ", " << channels_[i] << ", " << ih_
<< ", " << iw_;
}
VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_
<< ", " << ow_;
}
void printValueFormat() override {
for (size_t i = 0; i < inVals_.size(); ++i) {
VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
<< ": " << inVals_[i]->getFormat() << " >>>";
}
if (outVal_) {
VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
}
if (extOutVal_) {
VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
}
}
void printGradFormat() override {
if (extOutGrad_) {
VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
}
if (outGrad_) {
VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
}
for (size_t i = 0; i < inGrads_.size(); ++i) {
VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
<< ": " << inGrads_[i]->getFormat() << "<<<";
}
}
protected:
/**
* Forward functions: reset buffers(inputs, output, bias),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(inputs, output, bias)
* reset primitives and pipeline
*/
void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
};
} // namespace paddle
......@@ -21,8 +21,8 @@ namespace paddle {
bool MKLDNNLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
<< "Please set WITH_MKLDNN=ON "
CHECK(FLAGS_use_mkldnn) << "MKLDNNLayers only support use_mkldnn."
<< "Please set WITH_MKL=ON "
<< "and set use_mkldnn=True";
CHECK(!useGpu_) << "Do not support GPU yet";
......@@ -138,8 +138,11 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
}
}
void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) {
const Argument& input = inputLayers_[0]->getOutput();
void MKLDNNLayer::reshapeInput(int& batchsize,
int& height,
int& width,
size_t inputIdx) {
const Argument& input = inputLayers_[inputIdx]->getOutput();
batchsize = input.getBatchSize();
int h = input.getFrameHeight();
int w = input.getFrameWidth();
......
......@@ -178,7 +178,10 @@ protected:
/**
* reshape the input image sizes and input batchsize
*/
void reshapeInput(int& batchsize, int& height, int& width);
void reshapeInput(int& batchsize,
int& height,
int& width,
size_t inputIdx = 0);
/**
* reshape output image sizes
......
......@@ -29,7 +29,7 @@ gserver_test(test_KmaxSeqScore)
gserver_test(test_Expand)
gserver_test(test_MaxPoolingWithMaskOutput)
########## test_Mkldnn layers and activations ##########
########## test_MKLDNN layers and activations ##########
if(WITH_MKLDNN)
add_unittest_without_exec(test_MKLDNN
test_MKLDNN.cpp
......@@ -62,17 +62,6 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
endif()
if(NOT MOBILE_INFERENCE)
################### test_ProtoDataProvider ############
add_unittest_without_exec(test_ProtoDataProvider
test_ProtoDataProvider.cpp)
# test_ProtoDataProvider creates a directory with the same name,
# so if WORKING_DIRECTORY is the default directory, mkdir will fail.
add_test(NAME test_ProtoDataProvider
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
################## test_Evaluator #######################
add_unittest(test_Evaluator
test_Evaluator.cpp)
......@@ -110,3 +99,24 @@ add_test(NAME test_PyDataProvider2
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
)
################# test_CompareSparse ##################
add_unittest_without_exec(test_CompareSparse
test_CompareSparse.cpp)
if(NOT ON_TRAVIS)
add_test(NAME test_CompareSparse
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
./.set_port.sh -p port -n 6
${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif()
################ test_CompareTwoNets ######################
add_unittest_without_exec(test_CompareTwoNets
test_CompareTwoNets.cpp)
add_test(NAME test_CompareTwoNets
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
......@@ -23,7 +23,7 @@ limitations under the License. */
namespace paddle {
/**
* @brief test the functionality of Mkldnnlayers
* @brief test the functionality of MKLDNNlayers and MKLDNNActivations
* refer to paddle original function
*/
class MKLDNNTester {
......
./test_ProtoDataProvider/data1.bin
./test_ProtoDataProvider/data2.bin
./test_ProtoDataProvider/data1.bin.gz
./test_ProtoDataProvider/data2.bin.gz
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -14,27 +15,50 @@
from paddle.trainer_config_helpers import *
################################### Data Configuration ###################################
TrainData(ProtoData(files = "trainer/tests/mnist.list"))
################################### Algorithm Configuration ###################################
settings(batch_size = 1000,
learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
################################### Network Configuration ###################################
data = data_layer(name ="input", size=784)
######################## data source ################################
dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
dict_file = dict()
for line_count, line in enumerate(open(dict_path, "r")):
dict_file[line.strip()] = line_count
fc1 = fc_layer(input=data, size=800,
bias_attr=True,
act=SigmoidActivation())
define_py_data_sources2(
train_list='gserver/tests/Sequence/train.list',
test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
fc2 = fc_layer(input=fc1, size=800,
bias_attr=True,
act=SigmoidActivation())
settings(batch_size=5)
######################## network configure ################################
dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 256
label_dim = 3
sparse_update = get_config_arg("sparse_update", bool, False)
output = fc_layer(input=[fc1, fc2], size=10,
bias_attr=True,
act=SoftmaxActivation())
data = data_layer(name="word", size=dict_dim)
lbl = data_layer(name ="label", size=1)
emb = embedding_layer(
input=data,
size=word_dim,
param_attr=ParamAttr(sparse_update=sparse_update))
cost = classification_cost(input=output, label=lbl)
outputs(cost)
with mixed_layer(size=hidden_dim * 4) as lstm_input:
lstm_input += full_matrix_projection(input=emb)
lstm = lstmemory(
input=lstm_input,
act=TanhActivation(),
gate_act=SigmoidActivation(),
state_act=TanhActivation())
lstm_last = last_seq(input=lstm)
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=lstm_last)
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -14,27 +15,42 @@
from paddle.trainer_config_helpers import *
################################### Data Configuration ###################################
TrainData(ProtoData(files = "trainer/tests/mnist.list"))
################################### Algorithm Configuration ###################################
settings(batch_size = 1000,
learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
################################### Network Configuration ###################################
data = data_layer(name ="input", size=784)
######################## data source ################################
dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
dict_file = dict()
for line_count, line in enumerate(open(dict_path, "r")):
dict_file[line.strip()] = line_count
fc1 = fc_layer(input=data, size=800,
bias_attr=True,
act=SigmoidActivation())
define_py_data_sources2(
train_list='gserver/tests/Sequence/train.list',
test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
fc2 = fc_layer(input=fc1, size=800,
bias_attr=True,
act=SigmoidActivation())
settings(batch_size=5)
######################## network configure ################################
dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 128
label_dim = 3
output = fc_layer(input=[fc1, fc2], size=10,
bias_attr=True,
act=SoftmaxActivation())
# This config is designed to be equivalent to sequence_recurrent_group.py
lbl = data_layer(name ="label", size=1)
data = data_layer(name="word", size=dict_dim)
cost = classification_cost(input=output, label=lbl)
outputs(cost)
emb = embedding_layer(
input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
recurrent_last = last_seq(input=recurrent)
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=recurrent_last)
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
######################## data source ################################
dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
dict_file = dict()
for line_count, line in enumerate(open(dict_path, "r")):
dict_file[line.strip()] = line_count
define_py_data_sources2(
train_list='gserver/tests/Sequence/train.list',
test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
settings(batch_size=5)
######################## network configure ################################
dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 128
label_dim = 3
# This config is designed to be equivalent to sequence_recurrent.py
data = data_layer(name="word", size=dict_dim)
emb = embedding_layer(
input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
def step(y):
mem = memory(name="rnn_state", size=hidden_dim)
with mixed_layer(
name="rnn_state",
size=hidden_dim,
bias_attr=False,
act=SoftmaxActivation()) as out:
out += identity_projection(input=y)
out += full_matrix_projection(
input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__"))
return out
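# Each step computes rnn_state_t = softmax(emb_t + W * rnn_state_{t-1}), which
# is what recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
# computes in sequence_recurrent.py; reusing the parameter name
# "___recurrent_layer_0__" ties the two configs to the same weights.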
recurrent = recurrent_group(name="rnn", step=step, input=emb)
recurrent_last = last_seq(input=recurrent)
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=recurrent_last)
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
......@@ -22,8 +22,7 @@ limitations under the License. */
using namespace paddle; // NOLINT
using namespace std; // NOLINT
static const string& configFile1 =
"trainer/tests/sample_trainer_config_compare_sparse.conf";
static const string& configFile1 = "gserver/tests/sequence_lstm.conf";
DECLARE_bool(use_gpu);
DECLARE_string(config);
......
......@@ -30,8 +30,6 @@ DECLARE_bool(use_gpu);
DECLARE_string(config);
DECLARE_string(nics);
DEFINE_string(config_file_a, "", "config of one network to compare");
DEFINE_string(config_file_b, "", "config of another network to compare");
DEFINE_bool(need_high_accuracy,
false,
"whether need to run in double accuracy");
......@@ -42,6 +40,10 @@ DEFINE_double(
DECLARE_bool(thread_local_rand_use_global_seed);
DECLARE_int32(seed);
static const string& config_file_a = "gserver/tests/sequence_recurrent.py";
static const string& config_file_b =
"gserver/tests/sequence_recurrent_group.py";
struct ComData {
vector<Argument> outArgs;
vector<ParameterPtr> parameters;
......@@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) {
DataBatch dataBatch;
int32_t batchSize = trainer.getConfig().opt_config().batch_size();
trainer.getDataProvider()->reset();
trainer.getDataProvider()->setSkipShuffle();
trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
......@@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
TEST(Trainer, create) {
ComData dataA;
calcGradient(dataA, FLAGS_config_file_a);
calcGradient(dataA, config_file_a);
LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
ComData dataB;
calcGradient(dataB, FLAGS_config_file_b);
calcGradient(dataB, config_file_b);
LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
compareGradient(dataA, dataB);
......
......@@ -1081,6 +1081,21 @@ TEST(Layer, InterpolationLayer) {
}
}
TEST(Layer, DotProdLayer) {
TestConfig config;
config.layerConfig.set_type("dot_prod");
config.layerConfig.set_size(1);
config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
config.layerConfig.add_inputs();
config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) {
testLayerGrad(config, "dot_prod", 10, false, useGpu);
}
}
TEST(Layer, OuterProdLayer) {
TestConfig config;
config.layerConfig.set_type("out_prod");
......
......@@ -313,6 +313,47 @@ TEST(MKLDNNLayer, AddtoLayer) {
testAddtoLayer({4, 12, 1, 1}, 3);
}
static void getMKLDNNConcatConfig(TestConfig& cfg,
const std::vector<testImageDesc>& inputs) {
CHECK_GE(inputs.size(), 2) << "at least two inputs";
int oc = inputs[0].ic;
for (size_t i = 1; i < inputs.size(); ++i) {
CHECK_EQ(inputs[i].bs, inputs[0].bs);
CHECK_EQ(inputs[i].ih, inputs[0].ih);
CHECK_EQ(inputs[i].iw, inputs[0].iw);
oc += inputs[i].ic;
}
cfg.biasSize = 0;
cfg.layerConfig.set_type("mkldnn_concat");
cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
cfg.layerConfig.set_active_type("relu");
for (size_t i = 0; i < inputs.size(); ++i) {
std::stringstream ss;
ss << "layer_" << i;
cfg.inputDefs.push_back(
{INPUT_DATA,
ss.str(),
(size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
0});
LayerInputConfig* input = cfg.layerConfig.add_inputs();
ImageConfig* img_conf = input->mutable_image_conf();
img_conf->set_channels(inputs[i].ic);
img_conf->set_img_size_y(inputs[i].ih);
img_conf->set_img_size(inputs[i].iw);
}
}
void testConcatLayer(const std::vector<testImageDesc>& inputs) {
TestConfig dnnConfig;
getMKLDNNConcatConfig(dnnConfig, inputs);
RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
}
TEST(MKLDNNLayer, ConcatLayer) {
testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});
testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});
}
void testActivation(std::string actType, const testImageDesc& pm) {
// TODO(TJ): remove me when paddle support elu activation
if (actType == "mkldnn_elu") {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include <gtest/gtest.h>
#include "paddle/gserver/dataproviders/ProtoDataProvider.h"
#include "paddle/utils/Util.h"
#include "paddle/testing/TestUtil.h"
using namespace std; // NOLINT
std::vector<string> protoFiles{
"./test_ProtoDataProvider/data1.bin", "./test_ProtoDataProvider/data2.bin",
};
std::vector<string> protoFilesCompressed{
"./test_ProtoDataProvider/data1.bin.gz",
"./test_ProtoDataProvider/data2.bin.gz",
};
const char* kTestDir = "./test_ProtoDataProvider";
const char kProtoFileList[] = "gserver/tests/proto_files.txt";
const char kProtoFileListCompressed[] =
"gserver/tests/proto_files_compressed.txt";
const int kSparseMatrixDim = 1024;
using namespace paddle; // NOLINT
void prepareData(DataBatch* batch,
const int* numPerSlotType,
bool iid,
bool useGpu) {
batch->clear();
int64_t size = uniformRandom(100) + 10;
batch->setSize(size);
ICpuGpuVectorPtr sequenceStartPositions;
ICpuGpuVectorPtr subSequenceStartPositions;
if (!iid) {
int numSeqs = uniformRandom(10) + 1;
sequenceStartPositions =
ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
int* buf = sequenceStartPositions->getMutableData(false);
subSequenceStartPositions =
ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
int* subBuf = subSequenceStartPositions->getMutableData(false);
int64_t pos = 0;
int maxLen = 2 * size / numSeqs;
for (int i = 0; i < numSeqs; ++i) {
int len =
uniformRandom(min<int64_t>(maxLen, size - pos - numSeqs + i)) + 1;
buf[i] = pos;
subBuf[i] = pos;
pos += len;
VLOG(1) << " len=" << len;
}
buf[numSeqs] = size;
subBuf[numSeqs] = size;
}
vector<Argument>& arguments = batch->getStreams();
for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_DENSE]; ++i) {
int64_t dim = rand() % 10 + 4; // NOLINT rand_r
MatrixPtr mat = Matrix::create(size, dim, /* trans= */ false, false);
mat->randomizeUniform();
Argument arg;
arg.value = mat;
arg.sequenceStartPositions = sequenceStartPositions;
arguments.push_back(arg);
}
for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE]; ++i) {
MatrixPtr mat =
makeRandomSparseMatrix(size, kSparseMatrixDim, false, useGpu);
Argument arg;
arg.value = mat;
arg.sequenceStartPositions = sequenceStartPositions;
arg.subSequenceStartPositions = subSequenceStartPositions;
arguments.push_back(arg);
}
for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE]; ++i) {
MatrixPtr mat =
makeRandomSparseMatrix(size, kSparseMatrixDim, true, useGpu);
Argument arg;
arg.value = mat;
arg.sequenceStartPositions = sequenceStartPositions;
arguments.push_back(arg);
}
for (int i = 0; i < numPerSlotType[SlotDef::STRING]; ++i) {
int64_t dim = rand() % 10 + 4; // NOLINT rand_r
SVectorPtr vec = std::make_shared<std::vector<std::string>>();
for (int j = 0; j < size; ++j) {
vec->push_back(randStr(dim));
}
Argument arg;
arg.strs = vec;
arg.sequenceStartPositions = sequenceStartPositions;
arguments.push_back(arg);
}
for (int i = 0; i < numPerSlotType[SlotDef::INDEX]; ++i) {
int64_t dim = rand() % 10 + 4; // NOLINT rand_r
IVectorPtr vec = IVector::create(size, /* useGpu= */ false);
int* buf = vec->getData();
for (int j = 0; j < size; ++j) {
buf[j] = uniformRandom(dim);
}
Argument arg;
arg.ids = vec;
arg.sequenceStartPositions = sequenceStartPositions;
arguments.push_back(arg);
}
}
inline int getSlotDim(const Argument& arg) {
if (arg.value) {
return arg.value->getWidth();
} else if (arg.ids) {
return arg.ids->getMax() + 1;
} else if (arg.strs) {
return 1;
}
LOG(FATAL) << "Invalid argument";
return 0;
}
inline SlotDef::SlotType getSlotType(const Argument& arg) {
if (arg.value) {
auto& m = *arg.value;
auto& type = typeid(m);
if (type == typeid(CpuMatrix) || type == typeid(GpuMatrix)) {
return SlotDef::VECTOR_DENSE;
}
if (type == typeid(CpuSparseMatrix)) {
auto valueType =
std::dynamic_pointer_cast<CpuSparseMatrix>(arg.value)->getValueType();
if (NO_VALUE == valueType) {
return SlotDef::VECTOR_SPARSE_NON_VALUE;
} else {
return SlotDef::VECTOR_SPARSE_VALUE;
}
}
if (type == typeid(GpuSparseMatrix)) {
auto valueType =
std::dynamic_pointer_cast<GpuSparseMatrix>(arg.value)->getValueType();
if (NO_VALUE == valueType) {
return SlotDef::VECTOR_SPARSE_NON_VALUE;
} else {
return SlotDef::VECTOR_SPARSE_VALUE;
}
}
LOG(FATAL) << "Unknown matrix type";
}
if (arg.ids) return SlotDef::INDEX;
if (arg.strs) return SlotDef::STRING;
LOG(FATAL) << "Invalid argument";
return SlotDef::VECTOR_DENSE;
}
void getColRow(const Argument& arg,
int64_t pos,
bool useGpu,
int* colNum,
const int** rowCols,
const real** rowValues) {
SlotDef::SlotType type = getSlotType(arg);
GpuSparseMatrixPtr matGpu;
CpuSparseMatrixPtr matCpu;
if (useGpu) {
matGpu = dynamic_pointer_cast<GpuSparseMatrix>(arg.value);
ASSERT_TRUE(matGpu != NULL);
} else {
matCpu = dynamic_pointer_cast<CpuSparseMatrix>(arg.value);
ASSERT_TRUE(matCpu != NULL);
}
*colNum = useGpu ? matGpu->getColNum(pos) : matCpu->getColNum(pos);
*rowCols = useGpu ? matGpu->getRowCols(pos) : matCpu->getRowCols(pos);
if (type == SlotDef::VECTOR_SPARSE_VALUE) {
*rowValues = useGpu ? matGpu->getRowValues(pos) : matCpu->getRowValues(pos);
} else {
*rowValues = NULL;
}
}
void makeSample(const vector<Argument>& arguments,
int64_t pos,
bool isBeginning,
DataSample* sample,
bool useGpu) {
sample->set_is_beginning(isBeginning);
int slotid = 0;
for (auto& arg : arguments) {
SlotDef::SlotType type = getSlotType(arg);
int64_t dim = getSlotDim(arg);
switch (type) {
case SlotDef::VECTOR_DENSE: {
VectorSlot* vecSlot = sample->add_vector_slots();
auto values = vecSlot->mutable_values();
values->Reserve(dim);
for (int i = 0; i < dim; ++i) {
values->AddAlreadyReserved(
static_cast<float>(arg.value->getElement(pos, i)));
}
break;
}
case SlotDef::INDEX: {
sample->add_id_slots(arg.ids->get(pos));
break;
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
VectorSlot* vecSlot = sample->add_vector_slots();
auto ids = vecSlot->mutable_ids();
int colNum;
const int* rowCols;
const real* rowValues; // nullptr
getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues);
ids->Reserve(colNum);
for (int i = 0; i < colNum; ++i) {
ids->AddAlreadyReserved(rowCols[i]);
}
SubseqSlot* subseqSlot = sample->add_subseq_slots(); // subseq
subseqSlot->set_slot_id(slotid);
auto lens = subseqSlot->mutable_lens();
lens->Add(colNum);
break;
}
case SlotDef::VECTOR_SPARSE_VALUE: {
VectorSlot* vecSlot = sample->add_vector_slots();
auto values = vecSlot->mutable_values();
auto ids = vecSlot->mutable_ids();
int colNum;
const int* rowCols;
const real* rowValues;
getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues);
ids->Reserve(colNum);
values->Reserve(colNum);
for (int i = 0; i < colNum; ++i) {
ids->AddAlreadyReserved(rowCols[i]);
values->AddAlreadyReserved(rowValues[i]);
}
break;
}
case SlotDef::VAR_MDIM_DENSE:
case SlotDef::VAR_MDIM_INDEX: {
LOG(FATAL) << "Not implemented";
break;
}
case SlotDef::STRING: {
VectorSlot* vecSlot = sample->add_vector_slots();
vecSlot->add_strs((*arg.strs)[pos]);
break;
}
}
slotid++;
}
}
void writeData(const DataBatch& batch, bool useGpu, bool dataCompression) {
DataHeader header;
const vector<Argument>& arguments = batch.getStreams();
for (auto& argument : arguments) {
SlotDef* slotDef = header.add_slot_defs();
slotDef->set_type(getSlotType(argument));
slotDef->set_dim(getSlotDim(argument));
}
VLOG(1) << "header=" << header.DebugString();
int64_t totalSeqs = batch.getNumSequences();
int64_t seq = 0;
ICpuGpuVectorPtr sequenceStartPositions = arguments[0].sequenceStartPositions;
int64_t numWritten = 0;
vector<string> curProtoFiles =
dataCompression ? protoFilesCompressed : protoFiles;
for (size_t i = 0; i < curProtoFiles.size(); ++i) {
int64_t numSeqs = totalSeqs * (i + 1) / curProtoFiles.size() -
totalSeqs * i / curProtoFiles.size();
ofstream os(curProtoFiles[i]);
CHECK(os) << "Fail to open " << curProtoFiles[i];
unique_ptr<ProtoWriter> writer(new ProtoWriter(&os, dataCompression));
CHECK(writer->write(header));
for (int j = 0; j < numSeqs; ++j, ++seq) {
int64_t begin = seq;
int64_t end = seq + 1;
if (sequenceStartPositions) {
begin = sequenceStartPositions->getElement(seq);
end = sequenceStartPositions->getElement(seq + 1);
}
for (int pos = begin; pos < end; ++pos) {
DataSample sample;
makeSample(arguments, pos, pos == begin, &sample, useGpu);
CHECK(writer->write(sample));
++numWritten;
}
}
writer.reset(nullptr);
os.close();
}
CHECK_EQ(arguments[0].getBatchSize(), numWritten);
}
// check that the sample at pos1 in args1 is the same as the sample at pos2 in args2
void checkSample(const vector<Argument>& args1,
int64_t pos1,
const vector<Argument>& args2,
int64_t pos2,
bool useGpu) {
EXPECT_EQ(args1.size(), args2.size());
VLOG(1) << " pos1=" << pos1 << " pos2=" << pos2;
for (size_t i = 0; i < args1.size(); ++i) {
auto type = getSlotType(args1[i]);
int dim = getSlotDim(args1[i]);
EXPECT_EQ(type, getSlotType(args2[i]));
if (type == SlotDef::INDEX) {
EXPECT_GE(dim, getSlotDim(args2[i]));
} else {
EXPECT_EQ(dim, getSlotDim(args2[i]));
}
switch (type) {
case SlotDef::VECTOR_DENSE: {
for (int j = 0; j < dim; ++j) {
EXPECT_EQ(static_cast<float>(args1[i].value->getElement(pos1, j)),
static_cast<float>(args2[i].value->getElement(pos2, j)));
}
break;
}
case SlotDef::INDEX: {
EXPECT_EQ(args1[i].ids->get(pos1), args2[i].ids->get(pos2));
break;
}
case SlotDef::VECTOR_SPARSE_NON_VALUE:
case SlotDef::VECTOR_SPARSE_VALUE: {
int colNum1, colNum2;
const int *rowCols1, *rowCols2;
const real *rowValues1, *rowValues2;
getColRow(args1[i], pos1, useGpu, &colNum1, &rowCols1, &rowValues1);
getColRow(args2[i], pos2, useGpu, &colNum2, &rowCols2, &rowValues2);
EXPECT_EQ(colNum1, colNum2);
for (int j = 0; j < colNum1; ++j) {
EXPECT_EQ(rowCols1[j], rowCols2[j]);
if (type == SlotDef::VECTOR_SPARSE_VALUE) {
EXPECT_EQ(rowValues1[j], rowValues2[j]);
}
}
break;
}
case SlotDef::VAR_MDIM_DENSE:
case SlotDef::VAR_MDIM_INDEX: {
LOG(FATAL) << "Not implemented";
break;
}
case SlotDef::STRING: {
EXPECT_EQ((*args1[i].strs)[pos1], (*args2[i].strs)[pos2]);
break;
}
}
}
}
void testProtoDataProvider(int* numPerSlotType,
bool iid,
bool async,
bool useGpu,
bool dataCompression,
int numConstantSlots = 0) {
mkDir(kTestDir);
DataBatch data;
prepareData(&data, numPerSlotType, iid, useGpu);
writeData(data, useGpu, dataCompression);
DataConfig config;
config.set_type("proto");
config.set_files(dataCompression ? kProtoFileListCompressed : kProtoFileList);
config.set_async_load_data(async);
for (int i = 0; i < numConstantSlots; ++i) {
config.add_constant_slots(i + 11);
MatrixPtr w = Matrix::create(data.getSize(),
1,
/* trans= */ false,
/* useGpu= */ false);
w->assign(config.constant_slots(i));
data.appendData(w);
}
unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
dataProvider->setSkipShuffle();
EXPECT_EQ(data.getSize(), dataProvider->getSize());
int64_t batchSize = 10;
DataBatch batch;
size_t seq1 = 0;
vector<Argument>& args1 = data.getStreams();
ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
dataProvider->reset();
while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
vector<Argument>& args2 = batch.getStreams();
ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
for (auto& arg : args2) {
EXPECT_EQ(iid, !arg.sequenceStartPositions);
}
size_t numSeqs = batch.getNumSequences();
VLOG(1) << "numSeqs=" << numSeqs;
for (size_t seq2 = 0; seq2 < numSeqs; ++seq1, ++seq2) {
int64_t begin1 = seq1;
int64_t end1 = seq1 + 1;
if (sequenceStartPositions1) {
begin1 = sequenceStartPositions1->getElement(seq1);
end1 = sequenceStartPositions1->getElement(seq1 + 1);
EXPECT_LT(seq1, sequenceStartPositions1->getSize() - 1);
}
int64_t begin2 = seq2;
int64_t end2 = seq2 + 1;
if (sequenceStartPositions2) {
begin2 = sequenceStartPositions2->getElement(seq2);
end2 = sequenceStartPositions2->getElement(seq2 + 1);
}
VLOG(1) << " begin1=" << begin1 << " end1=" << end1
<< " begin2=" << begin2 << " end2=" << end2;
EXPECT_EQ(end1 - begin1, end2 - begin2);
for (int i = 0; i < end1 - begin1; ++i) {
checkSample(args1, begin1 + i, args2, begin2 + i, useGpu);
}
}
}
EXPECT_EQ(seq1, (size_t)data.getNumSequences());
rmDir(kTestDir);
}
TEST(ProtoDataProvider, test) {
int numSlotsArray[] = {0, 3};
int numTwoArray[] = {0, 1};
int numSlotsArraySize = sizeof(numSlotsArray) / sizeof(numSlotsArray[0]);
const int numSlot = 5;
int combination[numSlot] = {0};
int k = numSlot - 1;
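// combination[] acts as a 5-digit odometer over indices into numSlotsArray,
// so every mix of slot counts {0, 3} across the five slot types is tested.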
while (k >= 0) {
int numDenseVecSlots = numSlotsArray[combination[0]];
int numSparseNonValueVecSlots = numSlotsArray[combination[1]];
int numSparseValueVectorSlots = numSlotsArray[combination[2]];
int numStrSlots = numSlotsArray[combination[3]];
int numIdSlots = numSlotsArray[combination[4]];
// advance combination[] to the next case
k = numSlot - 1;
while (k >= 0) {
if (combination[k] < (numSlotsArraySize - 1)) {
++combination[k];
break;
} else {
combination[k] = 0;
--k;
}
}
if (numDenseVecSlots + numSparseNonValueVecSlots +
numSparseValueVectorSlots + numStrSlots + numIdSlots <
1)
continue;
for (int iid : numTwoArray) {
for (int async : numTwoArray) {
for (int useGpu : numTwoArray) {
for (int dataCompression : numTwoArray) {
if (async && useGpu) {
// Currently in async mode, useGpu is not supported
continue;
}
#ifndef PADDLE_WITH_CUDA
if (useGpu) {
continue;
}
#endif
LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
<< " numSparseNonValueVecSlots="
<< numSparseNonValueVecSlots
<< " numSparseValueVectorSlots="
<< numSparseValueVectorSlots
<< " numStrSlots=" << numStrSlots
<< " numIdSlots=" << numIdSlots << " iid=" << iid
<< " async=" << async << " useGpu=" << useGpu
<< " dataCompression=" << dataCompression;
int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
numSparseNonValueVecSlots;
numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] =
numSparseValueVectorSlots;
numPerSlotType[SlotDef::INDEX] = numIdSlots;
numPerSlotType[SlotDef::STRING] = numStrSlots;
testProtoDataProvider(
numPerSlotType, iid, async, useGpu, dataCompression);
} // end for (int dataCompression : numTwoArray)
} // end for (int useGpu : numTwoArray)
} // end for (int async : numTwoArray)
} // end for (int iid : numTwoArray)
} // end for (while, traverse all slots)
}
TEST(ProtoDataProvider, constant_slots) {
int numSlotsArray[] = {0, 3};
int numTwoArray[] = {0, 1};
for (int numDenseVecSlots : numSlotsArray) {
for (int numSparseNonValueVecSlots : numSlotsArray) {
if (numDenseVecSlots + numSparseNonValueVecSlots < 1) continue;
for (int numConstantSlots : {1, 2}) {
for (int useGpu : numTwoArray) {
for (int dataCompression : numTwoArray) {
#ifndef PADDLE_WITH_CUDA
if (useGpu) {
continue;
}
#endif
LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
<< " numSparseNonValueVecSlots="
<< numSparseNonValueVecSlots
<< " numConstantSlogs=" << numConstantSlots
<< " useGpu=" << useGpu
<< " dataCompression=" << dataCompression;
int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
numSparseNonValueVecSlots;
numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] = 1;
numPerSlotType[SlotDef::INDEX] = 1;
testProtoDataProvider(numPerSlotType,
/* iid= */ true,
/* async= */ false,
useGpu,
dataCompression,
numConstantSlots);
} // end for (int dataCompression : numTwoArray)
} // end for (int useGpu : numTwoArray)
} // end for (int numConstantSlots : {1, 2})
} // end for (int numSparseNonValueVecSlots : numSlotsArray)
} // end for (int numDenseVecSlots : numSlotsArray)
}
void checkSampleSequence(const vector<Argument>& args1,
const vector<Argument>& args2,
int64_t offset,
int64_t numSeqs,
bool useGpu) {
// check that the slot counts are equal
EXPECT_EQ(args1.size(), args2.size());
for (size_t i = 0; i < args1.size(); i++) {
auto type = getSlotType(args1[i]);
// check for args2: sequenceStartPositions vs numSeqs
// (1) size
EXPECT_EQ(args2[i].sequenceStartPositions->getSize(), (size_t)numSeqs + 1);
// (2) content
auto checkArgContent = [&](const Argument& args, int numSeqs) {
for (int j = 0; j <= numSeqs; j++) {
int start_pos = args.sequenceStartPositions->getElement(j);
EXPECT_EQ(start_pos, j);
}
};
switch (type) {
case SlotDef::INDEX: {
// args1: for label
checkArgContent(args2[i], numSeqs);
// check for args2: ids are equal to args1[offset]
// (1) size
EXPECT_EQ(args2[i].ids->getSize(), (size_t)numSeqs);
// (2) content
for (int j = 0; j < numSeqs; j++) {
EXPECT_EQ(args2[i].ids->get(j), args1[i].ids->get(offset + j));
}
break;
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
// args1: for sparse_non_value
// args2 should put sparse indexes in ids
int colNum1;
const int* rowCols1;
const real* rowValues1; // nullptr
int totalLength = 0;
for (int j = 0; j < numSeqs; j++) {
getColRow(
args1[i], offset + j, useGpu, &colNum1, &rowCols1, &rowValues1);
// (1) lengths
EXPECT_EQ(totalLength,
args2[i].sequenceStartPositions->getElement(j));
EXPECT_EQ(totalLength,
args2[i].subSequenceStartPositions->getElement(j));
// (2) content
for (int k = 0; k < colNum1; k++) {
EXPECT_EQ(rowCols1[k], args2[i].ids->get(totalLength + k));
}
totalLength += colNum1;
if (colNum1 == 0) {
// special case here: we will put a "-1" into ids when the column num is
// zero; see ProtoSequenceDataProvider::getNextBatchInternal.
EXPECT_EQ(-1, args2[i].ids->get(totalLength));
totalLength++;
}
}
EXPECT_EQ(totalLength,
args2[i].sequenceStartPositions->getElement(numSeqs));
EXPECT_EQ(totalLength,
args2[i].subSequenceStartPositions->getElement(numSeqs));
break;
}
case SlotDef::VECTOR_DENSE: {
// args1: for dense vector
checkArgContent(args2[i], numSeqs);
// check for args2: values are equal to args1[offset]
// (1) size
EXPECT_EQ(args2[i].value->getHeight(), (size_t)numSeqs);
EXPECT_EQ(args2[i].value->getWidth(), (size_t)getSlotDim(args1[i]));
// (2) content
for (int j = 0; j < numSeqs; j++) {
for (size_t k = 0; k < args2[i].value->getWidth(); k++) {
EXPECT_EQ(
static_cast<float>(args1[i].value->getElement(j + offset, k)),
static_cast<float>(args2[i].value->getElement(j, k)));
}
}
break;
}
default: { EXPECT_EQ(true, false) << "should not reach here"; }
}
}
}
void testProtoSequenceDataProvider(int* numPerSlotType,
bool async,
bool useGpu) {
mkDir(kTestDir);
DataBatch data;
prepareData(&data,
numPerSlotType,
/* iid */ true,
useGpu);
writeData(data, useGpu, /* dataCompression */ false);
DataConfig config;
config.set_type("proto_sequence");
config.set_files(kProtoFileList);
config.set_async_load_data(async);
unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
dataProvider->setSkipShuffle();
EXPECT_EQ(data.getSize(), dataProvider->getSize());
int64_t batchSize = 10;
DataBatch batch;
vector<Argument>& args1 = data.getStreams();
ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
dataProvider->reset();
size_t args1Offset = 0;
while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
vector<Argument>& args2 = batch.getStreams();
ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
for (auto& arg : args1) {
// args1 should not have sequence information
EXPECT_EQ(true, !arg.sequenceStartPositions);
}
for (auto& arg : args2) {
// args2 should have sequence information
EXPECT_NE(true, !arg.sequenceStartPositions);
}
size_t numSeqs = batch.getNumSequences();
checkSampleSequence(args1, args2, args1Offset, numSeqs, useGpu);
args1Offset += numSeqs;
}
EXPECT_EQ(args1Offset, (size_t)data.getNumSequences());
rmDir(kTestDir);
}
TEST(ProtoSequenceDataProvider, test) {
int numSlotsArray[] = {0, 3};
int numTwoArray[] = {0, 1};
for (int numSparseNonValueVecSlots : numSlotsArray) {
for (int numIdSlots : numSlotsArray) {
for (int numDenseVecSlots : numSlotsArray) {
if (numDenseVecSlots + numSparseNonValueVecSlots + numIdSlots < 1)
continue;
for (int async : numTwoArray) {
for (int useGpu : numTwoArray) {
if (async && useGpu) {
// Currently in async mode, useGpu is not supported
continue;
}
#ifndef PADDLE_WITH_CUDA
if (useGpu) {
continue;
}
#endif
LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
<< " numSparseNonValueVecSlots="
<< numSparseNonValueVecSlots
<< " numIdSlots=" << numIdSlots << " async=" << async
<< " useGpu=" << useGpu;
int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
numSparseNonValueVecSlots;
numPerSlotType[SlotDef::INDEX] = numIdSlots;
testProtoSequenceDataProvider(numPerSlotType, async, useGpu);
} // end for (int useGpu : numTwoArray)
} // end for (int async : numTwoArray)
} // end for (int numDenseVecSlots : numSlotsArray)
} // end for (int numIdSlots : numSlotsArray)
} // end for (int numSparseNonValueVecSlots : numSlotsArray)
}
......@@ -17,9 +17,13 @@ limitations under the License. */
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
#ifndef PADDLE_MOBILE_INFERENCE
DEFINE_int32(pool_limit_size,
536870912,
"maximum memory size managed by a memory pool, default is 512M");
#else
DEFINE_int32(pool_limit_size, 0, "default is 0");
#endif
namespace paddle {
......
......@@ -61,6 +61,18 @@ function(op_library TARGET)
set(pybind_flag 1)
endif()
if ("${TARGET}" STREQUAL "compare_op")
set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
endif()
# conv_op contains several operators
if ("${TARGET}" STREQUAL "conv_op")
set(pybind_flag 1)
# It is enough to add just one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
endif()
# pool_op contains several operators
if ("${TARGET}" STREQUAL "pool_op")
set(pybind_flag 1)
......@@ -68,9 +80,11 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
endif()
if ("${TARGET}" STREQUAL "compare_op")
# pool_cudnn_op contains several operators
if ("${TARGET}" STREQUAL "pool_cudnn_op")
set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
# It is enough to add just one operator to pybind
file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
endif()
# pool_with_index_op contains several operators
......@@ -80,25 +94,18 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
endif()
# conv_op contains several operators
if ("${TARGET}" STREQUAL "conv_op")
set(pybind_flag 1)
# It is enough to add just one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
endif()
# conv_transpose_op contains several operators
if ("${TARGET}" STREQUAL "conv_transpose_op")
set(pybind_flag 1)
# It is enough to add just one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
endif()
# pool_cudnn_op contains several operators
if ("${TARGET}" STREQUAL "pool_cudnn_op")
# conv_transpose_cudnn_op contains two operators
if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op")
set(pybind_flag 1)
# It is enough to add just one operator to pybind
file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n")
endif()
# save_restore_op contains several operators
......
......@@ -226,9 +226,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
T alpha = 1.0f, beta = 0.0f;
if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*input_grad);
t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
t.constant(static_cast<T>(0));
// Because beta is zero, it is unnecessary to reset input_grad.
for (int i = 0; i < groups; i++) {
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc,
......@@ -241,9 +240,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*filter_grad);
t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
t.constant(static_cast<T>(0));
// Because beta is zero, it is unnecessary to reset filter_grad.
for (int i = 0; i < groups; i++) {
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
......
......@@ -225,11 +225,15 @@ REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
ops::ConvOpGrad);
REGISTER_OP_CPU_KERNEL(conv2d,
ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(conv3d,
ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
......@@ -17,11 +17,15 @@
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(conv2d,
ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(
conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(conv3d,
ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(
conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
......@@ -23,7 +23,24 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
framework::OpAttrChecker* op_checker)
: Conv2DTransposeOpMaker(proto, op_checker) {
AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
.SetDefault(std::vector<int>{1, 1});
.SetDefault({1, 1});
AddAttr<int>("workspace_size_MB",
"workspace size for cudnn, in MB, "
"workspace is a section of GPU memory which will be "
"allocated/freed each time the operator runs, larger "
"workspace size can increase performance but also requires "
"better hardward. This size should be carefully setted.")
.SetDefault(4096);
}
};
class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker {
public:
CudnnConv3DTransposeOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: Conv3DTransposeOpMaker(proto, op_checker) {
AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
.SetDefault({1, 1, 1});
AddAttr<int>("workspace_size_MB",
"workspace size for cudnn, in MB, "
"workspace is a section of GPU memory which will be "
......@@ -48,3 +65,14 @@ REGISTER_OP_CPU_KERNEL(
REGISTER_OP_CPU_KERNEL(
conv2d_transpose_cudnn_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp,
ops::CudnnConv3DTransposeOpMaker, conv3d_transpose_cudnn_grad,
ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
conv3d_transpose_cudnn,
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv3d_transpose_cudnn_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
......@@ -54,15 +54,21 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
ScopedTensorDescriptor output_desc;
ScopedFilterDescriptor filter_desc;
ScopedConvolutionDescriptor conv_desc;
DataLayout layout = DataLayout::kNCHW;
DataLayout layout;
if (strides.size() == 2U) {
layout = DataLayout::kNCHW;
} else {
layout = DataLayout::kNCDHW;
}
// N, M, H, W
// (N, M, H, W) or (N, M, D, H, W)
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()));
// N, C, O_h, O_w
// (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize2int(output->dims()));
// M, C, K_h, K_w
// (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize2int(filter->dims()));
cudnnConvolutionDescriptor_t cudnn_conv_desc =
......@@ -136,13 +142,13 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
ScopedConvolutionDescriptor conv_desc;
DataLayout layout = DataLayout::kNCHW;
// Input: (N, M, H, W)
// Input: (N, M, H, W) or (N, M, D, H, W)
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()));
// Output: (N, C, O_H, O_W)
// Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize2int(output_grad->dims()));
// Filter (M, C, K_H, K_W)
// Filter (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize2int(filter->dims()));
......@@ -200,8 +206,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
T alpha = 1.0f, beta = 0.0f;
if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
math::set_constant(ctx.device_context(), input_grad, 0);
// Because beta is zero, it is unnecessary to reset input_grad.
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_output_desc, output_grad_data,
cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
......@@ -212,8 +217,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
math::set_constant(ctx.device_context(), filter_grad, 0);
// Because beta is zero, it is unnecessary to reset filter_grad.
// Gradient with respect to the filter
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
......@@ -234,3 +238,8 @@ REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn,
ops::CudnnConvTransposeOpKernel<float>);
REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad,
ops::CudnnConvTransposeGradOpKernel<float>);
REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn,
ops::CudnnConvTransposeOpKernel<float>);
REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn_grad,
ops::CudnnConvTransposeGradOpKernel<float>);
......@@ -30,11 +30,6 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
for (size_t i = 0; i < paddings.size(); ++i) {
PADDLE_ENFORCE_EQ(paddings[i], 0,
"No Padding allowed in conv transpose op.");
}
PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
"ConvTransposeOp intput should be 4-D or 5-D tensor.");
PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
......@@ -52,7 +47,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back((in_dims[i + 2] - 1) * strides[i] +
output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
filter_dims[i + 2]);
}
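// Transposed-conv output size: (in - 1) * stride - 2 * padding + kernel;
// e.g. in = 5, stride = 2, padding = 1, kernel = 3 gives (5 - 1) * 2 - 2 + 3 = 9.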
ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
......@@ -190,17 +185,21 @@ REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
REGISTER_OP_CPU_KERNEL(
conv2d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
conv2d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
conv3d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
conv3d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
conv3d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
......@@ -18,14 +18,18 @@ namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
conv2d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(
conv2d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(
conv3d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(
conv3d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, double>);
......@@ -62,7 +62,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
Tensor* output = context.Output<Tensor>("Output");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
// Actually, no paddings and groups allowed in conv transpose.
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
    // TODO(Zhuoyuan): Paddings can be added in the future.
    // groups will always be disabled in conv2d_transpose.
......@@ -148,8 +147,8 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
} else if (filter_shape_vec.size() == 3) {
// col2vol: col_matrix -> dy
// from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
col2vol(context.device_context(), col, dilations, strides,
std::vector<int>{0, 0, 0}, &output_batch);
col2vol(context.device_context(), col, dilations, strides, paddings,
&output_batch);
}
}
}
......@@ -173,7 +172,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
if ((!input_grad) && (!filter_grad)) return;
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
// Actually, no paddings and groups allowed in conv transpose.
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
const int batch_size = static_cast<int>(input->dims()[0]);
......
......@@ -24,8 +24,17 @@
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
template <typename Place, typename T>
inline void ReorderInitState(const platform::DeviceContext& ctx,
const framework::Tensor& src, const size_t* index,
framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<Place, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace());
row_shuffle(ctx, src, index, *dst, indexed_src);
}
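In effect, ReorderInitState is a row gather/scatter driven by the LoD order index. A rough numpy sketch of both directions (illustrative only, not Paddle code):

import numpy as np

# indexed_src=True gathers rows of src into batch order (forward pass);
# indexed_src=False scatters rows back to the original order (backward pass).
def reorder_init_state(src, index, indexed_src):
    dst = np.empty_like(src)
    for i, idx in enumerate(index):
        if indexed_src:
            dst[i] = src[idx]
        else:
            dst[idx] = src[i]
    return dst

h0 = np.array([[0.0], [1.0], [2.0]])
order = [2, 0, 1]  # sequences sorted by descending length
round_trip = reorder_init_state(reorder_init_state(h0, order, True), order, False)
assert (round_trip == h0).all()  # gather then scatter restores the input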
template <typename Place, typename T>
class GRUKernel : public framework::OpKernel<T> {
......@@ -33,7 +42,6 @@ class GRUKernel : public framework::OpKernel<T> {
void BatchCompute(const framework::ExecutionContext& context) const {
auto* input = context.Input<LoDTensor>("Input");
auto* h0 = context.Input<Tensor>("H0");
const T* h0_data = h0 ? h0->data<T>() : nullptr;
auto* weight = context.Input<Tensor>("Weight");
const T* weight_data = weight->data<T>();
auto* bias = context.Input<Tensor>("Bias");
......@@ -66,7 +74,18 @@ class GRUKernel : public framework::OpKernel<T> {
gru_value.gateWeight = const_cast<T*>(weight_data);
gru_value.stateWeight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size);
gru_value.prevOutValue = const_cast<T*>(h0_data);
Tensor ordered_h0;
const size_t* order = batch_gate->lod()[2].data();
if (h0) {
      // Since batch computing for GRU reorders the input sequences
      // according to their length, the initial hidden state also needs
      // to be reordered.
ReorderInitState<Place, T>(context.device_context(), *h0, order,
&ordered_h0, true);
gru_value.prevOutValue = ordered_h0.data<T>();
} else {
gru_value.prevOutValue = nullptr;
}
auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1;
for (size_t n = 0; n < num_batch; n++) {
......@@ -102,7 +121,6 @@ class GRUGradKernel : public framework::OpKernel<T> {
public:
void BatchCompute(const framework::ExecutionContext& context) const {
auto* h0 = context.Input<Tensor>("H0");
const T* h0_data = h0 ? h0->data<T>() : nullptr;
auto* weight = context.Input<Tensor>("Weight");
const T* weight_data = weight->data<T>();
auto* batch_gate = context.Input<LoDTensor>("BatchGate");
......@@ -135,6 +153,17 @@ class GRUGradKernel : public framework::OpKernel<T> {
zero(dev_ctx, &batch_gate_grad, static_cast<T>(0.0));
zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
Tensor ordered_h0, ordered_h0_grad;
const size_t* order = batch_gate->lod()[2].data();
if (h0) {
ReorderInitState<Place, T>(context.device_context(), *h0, order,
&ordered_h0, true);
}
if (h0_grad) {
ordered_h0_grad.mutable_data<T>(h0_grad->dims(), context.GetPlace());
zero(context.device_context(), &ordered_h0_grad, static_cast<T>(0.0));
}
bool is_reverse = context.Attr<bool>("is_reverse");
batch_hidden_grad.set_lod(batch_hidden->lod());
to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
......@@ -176,14 +205,9 @@ class GRUGradKernel : public framework::OpKernel<T> {
batch_reset_hidden_prev_grad.Slice(bstart, bend);
gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>();
if (n == 0) {
gru_value.prevOutValue = const_cast<T*>(h0_data);
if (h0_grad) {
T* h0_grad_data = h0_grad->mutable_data<T>(context.GetPlace());
zero(dev_ctx, h0_grad, static_cast<T>(0.0));
gru_grad.prevOutGrad = h0_grad_data;
} else {
gru_grad.prevOutGrad = nullptr;
}
gru_value.prevOutValue = h0 ? ordered_h0.data<T>() : nullptr;
gru_grad.prevOutGrad =
h0 && h0_grad ? ordered_h0_grad.data<T>() : nullptr;
} else {
int bstart_pre = static_cast<int>(batch_starts[n - 1]);
Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
......@@ -208,6 +232,10 @@ class GRUGradKernel : public framework::OpKernel<T> {
math::ColwiseSum<Place, T> col_sum;
col_sum(dev_ctx, batch_gate_grad, bias_grad);
}
if (h0 && h0_grad) {
ReorderInitState<Place, T>(context.device_context(), ordered_h0_grad,
order, h0_grad, false);
}
}
void Compute(const framework::ExecutionContext& context) const override {
......
add_subdirectory(detail)
if(WITH_GPU)
nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context)
nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto)
nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor)
nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function)
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
......@@ -15,7 +15,7 @@ if(WITH_GPU)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
else()
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context)
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
cc_library(softmax SRCS softmax.cc DEPS device_context)
cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context)
......
......@@ -119,8 +119,8 @@ __global__ void col2im(int n, const T* data_col, int im_height, int im_width,
if (index < n) {
T val = 0;
int w = index % im_width;
int h = (index / im_width) % im_height;
int w = index % im_width + padding_width;
int h = (index / im_width) % im_height + padding_height;
int c = index / (im_width * im_height);
// compute the start and end of the output
......
......@@ -135,8 +135,7 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
if (input_grad) {
T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
math::SetConstant<paddle::platform::GPUPlace, T> set_zero;
set_zero(ctx.device_context(), input_grad, static_cast<T>(0));
// Because beta is zero, it is unnecessary to reset input_grad.
PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward(
handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
......
......@@ -30,7 +30,7 @@ void sgdUpdateCpu(real learningRate,
const real* grad,
real* momentumVec) {
decayRate *= learningRate;
#ifdef PADDLE_USE_MKLDNN
#ifdef PADDLE_USE_MKLML
#pragma omp parallel for
#endif
for (size_t i = 0; i < size; ++i) {
......
......@@ -63,9 +63,10 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
} \
} while (false)
enum class DataLayout {
enum class DataLayout {  // Not used yet
kNHWC,
kNCHW,
kNCDHW,
kNCHW_VECT_C,
};
......@@ -107,12 +108,15 @@ class CudnnDataType<double> {
}
};
inline cudnnTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) {
inline cudnnTensorFormat_t GetCudnnTensorFormat(
    const DataLayout& order) {  // Not used yet
switch (order) {
case DataLayout::kNHWC:
return CUDNN_TENSOR_NHWC;
case DataLayout::kNCHW:
return CUDNN_TENSOR_NCHW;
case DataLayout::kNCDHW:
      return CUDNN_TENSOR_NCHW;  // TODO(chengduoZH): add CUDNN_TENSOR_NCDHW
default:
PADDLE_THROW("Unknown cudnn equivalent for order");
}
......@@ -139,7 +143,7 @@ class ScopedTensorDescriptor {
strides[i] = dims[i + 1] * strides[i + 1];
}
// Update tensor descriptor dims setting if groups > 1
// FIXME(typhoonzero): Assume using NCHW order
// FIXME(typhoonzero): Assume using NCHW or NCDHW order
std::vector<int> dims_with_group(dims.begin(), dims.end()); // copy
if (groups > 1) {
dims_with_group[1] = dims_with_group[1] / groups;
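The stride loop above builds packed strides for the tensor descriptor: strides[i] = dims[i + 1] * strides[i + 1], with the innermost stride fixed at 1. A quick check of the same rule (our sketch):

# Packed strides for a dense tensor, innermost dimension contiguous.
def packed_strides(dims):
    strides = [1] * len(dims)
    for i in range(len(dims) - 2, -1, -1):
        strides[i] = dims[i + 1] * strides[i + 1]
    return strides

# NCHW shape [2, 3, 4, 5]: element (n, c, h, w) lives at n*60 + c*20 + h*5 + w.
assert packed_strides([2, 3, 4, 5]) == [60, 20, 5, 1]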
......@@ -176,9 +180,10 @@ class ScopedFilterDescriptor {
const cudnnDataType_t type,
const std::vector<int>& kernel,
const int groups = 1) {
// filter layout: MCHW, where M is the number of
// filter layout: MCHW(MCDHW), where M is the number of
// output image channels, C is the number of input image channels,
// H and W is height and width of filter.
// D is the depth of the filter, H is the height of the filter, and W is the
// width of the filter.
std::vector<int> kernel_with_group(kernel.begin(), kernel.end());
if (groups > 1) {
// M /= groups
......
......@@ -57,8 +57,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
| `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
| `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
| `WITH_TESTING` | ON | Build unit tests binaries. |
| `WITH_MKLDNN` | ON | Build with [Intel® MKL DNN](https://github.com/01org/mkl-dnn) support. |
| `WITH_MKLML` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) support. |
| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
| `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
| `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
| `WITH_C_API` | OFF | Build capi libraries for inference. |
......
......@@ -34,8 +34,7 @@ function cmake_gen() {
${PYTHON_FLAGS}
-DWITH_DOC=OFF
-DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_MKLDNN=${WITH_MKLDNN:-ON}
-DWITH_MKLML=${WITH_MKLML:-ON}
-DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF}
-DWITH_GOLANG=${WITH_GOLANG:-ON}
-DWITH_SWIG_PY=ON
......@@ -56,8 +55,7 @@ EOF
${PYTHON_FLAGS} \
-DWITH_DOC=OFF \
-DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
-DWITH_MKLML=${WITH_MKLML:-ON} \
-DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \
-DWITH_GOLANG=${WITH_GOLANG:-ON} \
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
......
......@@ -18,8 +18,8 @@ function version(){
echo "PaddlePaddle @PADDLE_VERSION@, compiled with"
echo " with_avx: @WITH_AVX@"
echo " with_gpu: @WITH_GPU@"
echo " with_mkl: @WITH_MKL@"
echo " with_mkldnn: @WITH_MKLDNN@"
echo " with_mklml: @WITH_MKLML@"
echo " with_double: @WITH_DOUBLE@"
echo " with_python: @WITH_PYTHON@"
echo " with_rdma: @WITH_RDMA@"
......@@ -45,8 +45,8 @@ function ver2num() {
function cpu_config() {
# auto set KMP_AFFINITY and OMP_DYNAMIC from Hyper Threading Status
# only when MKLDNN or MKLML enabled
if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF"]; then
# only when MKL enabled
if [ "@WITH_MKL@" == "OFF" ]; then
return 0
fi
ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
......@@ -70,8 +70,8 @@ function cpu_config() {
function threads_config() {
# auto set OMP_NUM_THREADS and MKL_NUM_THREADS
# according to trainer_count and total processors
# only when MKLDNN or MKLML enabled
if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF"]; then
# only when MKL enabled
if [ "@WITH_MKL@" == "OFF" ]; then
return 0
fi
processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
......
......@@ -6,7 +6,7 @@ mkdir -p $TRAVIS_BUILD_DIR/build
cd $TRAVIS_BUILD_DIR/build
# Compile Documentation only.
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
make -j `nproc` gen_proto_py
make -j `nproc` paddle_docs paddle_docs_cn
......
......@@ -137,6 +137,10 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
}
}
if (FLAGS_use_mkldnn) {
    CHECK_EQ(FLAGS_trainer_count, 1UL) << "MKLDNN only needs 1 trainer";
}
if (testing) {
LOG(INFO) << "trainer: in testing mode";
if (config_->getOptConfig().use_sparse_remote_updater() ||
......
......@@ -28,35 +28,7 @@ if(WITH_PYTHON)
${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif()
################ test_CompareTwoNets ######################
add_unittest_without_exec(test_CompareTwoNets
test_CompareTwoNets.cpp)
add_test(NAME test_CompareTwoNets
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
--config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
############### test_CompareTwoOpts ###################
add_unittest_without_exec(test_CompareTwoOpts
test_CompareTwoOpts.cpp)
add_test(NAME test_CompareTwoOpts
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoOpts
--config_file_a=trainer/tests/sample_trainer_config_opt_a.conf --config_file_b=trainer/tests/sample_trainer_config_opt_b.conf
--num_passes=1 --need_high_accuracy=0
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
################# test_CompareSparse ##################
add_unittest_without_exec(test_CompareSparse
test_CompareSparse.cpp)
if(NOT ON_TRAVIS)
add_test(NAME test_CompareSparse
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
./.set_port.sh -p port -n 6
${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif()
################# test_recurrent_machine_generation ###############
add_unittest_without_exec(test_recurrent_machine_generation
test_recurrent_machine_generation.cpp)
......
./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data
#edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#Todo(luotao02) This config is only used for unittest. It is out of date now, and will be updated later.
# Note: when making changes to this file, please make sure
# sample_trainer_config_rnn.conf is changed accordingly so that the unittest
# for comparing these two nets can pass (test_CompareTwoNets)
default_initial_std(0.1)
default_device(0)
word_dim = 999
l1 = 0
l2 = 0
model_type("nn")
sparse_update = get_config_arg("sparse_update", bool, False)
TrainData(ProtoData(
type = "proto_sequence",
files = ('trainer/tests/train_sparse.list'),
))
Settings(
algorithm='sgd',
batch_size=100,
learning_rate=0.0001,
learning_rate_decay_a=4e-08,
learning_rate_decay_b=0.0,
learning_rate_schedule='poly',
)
wordvec_dim = 32
layer2_dim = 16
layer3_dim = 16
hidden_dim = 32
slot_names = ["qb", "qw", "tb", "tw"]
def ltr_network(network_name,
word_dim=word_dim,
wordvec_dim=wordvec_dim,
layer2_dim=layer2_dim,
layer3_dim=layer3_dim,
hidden_dim=hidden_dim,
slot_names=slot_names,
l1=l1,
l2=l2):
slotnum = len(slot_names)
for i in xrange(slotnum):
Inputs(slot_names[i] + network_name)
for i in xrange(slotnum):
Layer(
name = slot_names[i] + network_name,
type = "data",
size = word_dim,
device = -1,
)
Layer(
name = slot_names[i] + "_embedding_" + network_name,
type = "mixed",
size = wordvec_dim,
bias = False,
device = -1,
inputs = TableProjection(slot_names[i] + network_name,
parameter_name = "embedding.w0",
decay_rate_l1=l1,
sparse_remote_update = True,
sparse_update = sparse_update,
),
)
Layer(
name = slot_names[i] + "_rnn1_" + network_name,
type = "recurrent",
active_type = "tanh",
bias = Bias(initial_std = 0,
parameter_name = "rnn1.bias"),
inputs = Input(slot_names[i] + "_embedding_" + network_name,
parameter_name = "rnn1.w0")
)
Layer(
name = slot_names[i] + "_rnnlast_" + network_name,
type = "seqlastins",
inputs = [
slot_names[i] + "_rnn1_" + network_name,
],
)
Layer(
name = "layer2_" + network_name,
type = "fc",
active_type = "tanh",
size = layer2_dim,
bias = Bias(parameter_name = "layer2.bias"),
inputs = [Input(slot_name + "_rnnlast_" + network_name,
parameter_name = "_layer2_" + slot_name + ".w",
decay_rate = l2,
initial_smart = True) for slot_name in slot_names]
)
Layer(
name = "layer3_" + network_name,
type = "fc",
active_type = "tanh",
size = layer3_dim,
bias = Bias(parameter_name = "layer3.bias"),
inputs = [
Input("layer2_" + network_name,
parameter_name = "_layer3.w",
decay_rate = l2,
initial_smart = True),
]
)
Layer(
name = "output_" + network_name,
type = "fc",
size = 1,
bias = False,
inputs = [
Input("layer3_" + network_name,
parameter_name = "_layerO.w"),
],
)
ltr_network("left")
ltr_network("right")
Inputs("label")
Layer(
name = "label",
type = "data",
size = 1,
)
Outputs("cost", "qb_rnnlast_left")
Layer(
name = "cost",
type = "rank-cost",
inputs = ["output_left", "output_right", "label"],
)
#edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#Todo(luotao02) This config is only used for unittest. It is out of date now, and will be updated later.
# Note: when making changes to this file, please make sure
# sample_trainer_config_rnn.conf is changed accordingly so that the unittest
# for comparing these two nets can pass (test_CompareTwoNets)
default_initial_std(0.1)
default_device(0)
word_dim = 1451594
l1 = 0
l2 = 0
model_type("nn")
sparse_update = get_config_arg("sparse_update", bool, False)
TrainData(ProtoData(
type = "proto_sequence",
files = ('trainer/tests/train.list'),
))
Settings(
algorithm='sgd',
batch_size=100,
learning_rate=0.0001,
learning_rate_decay_a=4e-08,
learning_rate_decay_b=0.0,
learning_rate_schedule='poly',
)
wordvec_dim = 128
layer2_dim = 96
layer3_dim = 96
hidden_dim = 128
slot_names = ["qb", "qw", "tb", "tw"]
def ltr_network(network_name,
word_dim=word_dim,
wordvec_dim=wordvec_dim,
layer2_dim=layer2_dim,
layer3_dim=layer3_dim,
hidden_dim=hidden_dim,
slot_names=slot_names,
l1=l1,
l2=l2):
slotnum = len(slot_names)
for i in xrange(slotnum):
Inputs(slot_names[i] + network_name)
for i in xrange(slotnum):
Layer(
name = slot_names[i] + network_name,
type = "data",
size = word_dim,
device = -1,
)
Layer(
name = slot_names[i] + "_embedding_" + network_name,
type = "mixed",
size = wordvec_dim,
bias = False,
device = -1,
inputs = TableProjection(slot_names[i] + network_name,
parameter_name = "embedding.w0",
decay_rate_l1=l1,
sparse_remote_update = True,
sparse_update = sparse_update,
),
)
Layer(
name = slot_names[i] + "_rnn1_" + network_name,
type = "recurrent",
active_type = "tanh",
bias = Bias(initial_std = 0,
parameter_name = "rnn1.bias"),
inputs = Input(slot_names[i] + "_embedding_" + network_name,
parameter_name = "rnn1.w0")
)
Layer(
name = slot_names[i] + "_rnnlast_" + network_name,
type = "seqlastins",
inputs = [
slot_names[i] + "_rnn1_" + network_name,
],
)
Layer(
name = "layer2_" + network_name,
type = "fc",
active_type = "tanh",
size = layer2_dim,
bias = Bias(parameter_name = "layer2.bias"),
inputs = [Input(slot_name + "_rnnlast_" + network_name,
parameter_name = "_layer2_" + slot_name + ".w",
decay_rate = l2,
initial_smart = True) for slot_name in slot_names]
)
Layer(
name = "layer3_" + network_name,
type = "fc",
active_type = "tanh",
size = layer3_dim,
bias = Bias(parameter_name = "layer3.bias"),
inputs = [
Input("layer2_" + network_name,
parameter_name = "_layer3.w",
decay_rate = l2,
initial_smart = True),
]
)
Layer(
name = "output_" + network_name,
type = "fc",
size = 1,
bias = False,
inputs = [
Input("layer3_" + network_name,
parameter_name = "_layerO.w"),
],
)
ltr_network("left")
ltr_network("right")
Inputs("label")
Layer(
name = "label",
type = "data",
size = 1,
)
Outputs("cost", "qb_rnnlast_left")
Layer(
name = "cost",
type = "rank-cost",
inputs = ["output_left", "output_right", "label"],
)
#edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#Todo(luotao02) This config is only used for unittest. It is out of date now, and will be updated later.
# Note: when making changes to this file, please make sure
# sample_trainer_config_qb_rnn.conf is changed accordingly so that the unittest
# for comparing these two nets can pass (test_CompareTwoNets)
default_initial_std(0.1)
default_device(0)
word_dim = 1451594
l1 = 0
l2 = 0
model_type("recurrent_nn")
sparse_update = get_config_arg("sparse_update", bool, False)
TrainData(ProtoData(
type = "proto_sequence",
files = ('trainer/tests/train.list'),
))
Settings(
algorithm='sgd',
batch_size=100,
learning_rate=0.0001,
learning_rate_decay_a=4e-08,
learning_rate_decay_b=0.0,
learning_rate_schedule='poly',
)
wordvec_dim = 128
layer2_dim = 96
layer3_dim = 96
hidden_dim = 128
slot_names = ["qb", "qw", "tb", "tw"]
def SimpleRecurrentLayer(name,
size,
active_type,
bias,
input_layer_name,
parameter_name,
seq_reversed = False):
RecurrentLayerGroupBegin(name + "_layer_group",
in_links=[input_layer_name],
out_links=[name],
seq_reversed=seq_reversed)
memory_name = Memory(name=name, size=size)
Layer(
name = name,
type = "mixed",
size = size,
active_type = active_type,
bias = bias,
inputs = [IdentityProjection(input_layer_name),
FullMatrixProjection(memory_name,
parameter_name = parameter_name,
),
]
)
RecurrentLayerGroupEnd(name + "_layer_group")
def ltr_network(network_name,
word_dim=word_dim,
wordvec_dim=wordvec_dim,
layer2_dim=layer2_dim,
layer3_dim=layer3_dim,
hidden_dim=hidden_dim,
slot_names=slot_names,
l1=l1,
l2=l2):
slotnum = len(slot_names)
for i in xrange(slotnum):
Inputs(slot_names[i] + network_name)
for i in xrange(slotnum):
Layer(
name = slot_names[i] + network_name,
type = "data",
size = word_dim,
device = -1,
)
Layer(
name = slot_names[i] + "_embedding_" + network_name,
type = "mixed",
size = wordvec_dim,
bias = False,
device = -1,
inputs = TableProjection(slot_names[i] + network_name,
parameter_name = "embedding.w0",
decay_rate_l1=l1,
sparse_remote_update = True,
sparse_update = sparse_update,
),
)
SimpleRecurrentLayer(
name = slot_names[i] + "_rnn1_" + network_name,
size = hidden_dim,
active_type = "tanh",
bias = Bias(initial_std = 0,
parameter_name = "rnn1.bias"),
input_layer_name = slot_names[i] + "_embedding_" + network_name,
parameter_name = "rnn1.w0",
)
Layer(
name = slot_names[i] + "_rnnlast_" + network_name,
type = "seqlastins",
inputs = [
slot_names[i] + "_rnn1_" + network_name,
],
)
Layer(
name = "layer2_" + network_name,
type = "fc",
active_type = "tanh",
size = layer2_dim,
bias = Bias(parameter_name = "layer2.bias"),
inputs = [Input(slot_name + "_rnnlast_" + network_name,
parameter_name = "_layer2_" + slot_name + ".w",
decay_rate = l2,
initial_smart = True) for slot_name in slot_names]
)
Layer(
name = "layer3_" + network_name,
type = "fc",
active_type = "tanh",
size = layer3_dim,
bias = Bias(parameter_name = "layer3.bias"),
inputs = [
Input("layer2_" + network_name,
parameter_name = "_layer3.w",
decay_rate = l2,
initial_smart = True),
]
)
Layer(
name = "output_" + network_name,
type = "fc",
size = 1,
bias = False,
inputs = [
Input("layer3_" + network_name,
parameter_name = "_layerO.w"),
],
)
ltr_network("left")
ltr_network("right")
Inputs("label")
Layer(
name = "label",
type = "data",
size = 1,
)
Outputs("cost", "qb_rnnlast_left")
Layer(
name = "cost",
type = "rank-cost",
inputs = ["output_left", "output_right", "label"],
)
......@@ -20,28 +20,6 @@ import random
import json
import string
@provider(slots=[
SparseNonValueSlot(10), DenseSlot(2), SparseValueSlot(10), StringSlot(1),
IndexSlot(3)
])
def processNonSequenceData(obj, filename):
with open(filename, "rb") as f:
for line in f:
slots_str = line.split(';')
index = int(slots_str[0])
non_values = map(int, slots_str[1].split()[1:])
dense = map(float, slots_str[2].split()[1:])
strs = slots_str[4].strip().split(' ', 1)[1]
def __values_mapper__(s):
s = s.split(":")
return int(s[0]), float(s[1])
values = map(__values_mapper__, slots_str[3].split()[1:])
yield [non_values, dense, values, strs, index]
SPARSE_ID_LIMIT = 1000
SPARSE_ID_COUNT = 100
SEQUENCE_LIMIT = 50
......@@ -146,8 +124,6 @@ def processSubSeqAndGenerateData(obj, name):
if __name__ == "__main__":
pvd = processNonSequenceData("test.txt")
print pvd.getNextBatch(100)
pvd = processSeqAndGenerateData("_")
print pvd.getNextBatch(100)
pvd = processSubSeqAndGenerateData("_")
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <paddle/utils/PythonUtil.h>
#include <algorithm>
#include <cstdlib>
#include "paddle/trainer/Trainer.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
DECLARE_int32(gpu_id);
DECLARE_bool(local);
DECLARE_bool(use_gpu);
DECLARE_string(config);
DECLARE_string(nics);
DEFINE_string(config_file_a, "", "config of one network to compare");
DEFINE_string(config_file_b, "", "config of another network to compare");
DEFINE_bool(need_high_accuracy,
true,
"whether need to run in double accuracy (recommended)");
DEFINE_double(
max_diff_ratio,
0.0f,
"max diff ratio allowed for outputs and parameters (value/gradient)");
struct ComData {
vector<Argument> outArgs;
vector<ParameterPtr> parameters;
};
void calcGradient(ComData& data, const string configFile) {
FLAGS_config = configFile;
FLAGS_local = true;
FLAGS_use_gpu = false;
FLAGS_nics = "";
*ThreadLocalRand::getSeed() = 0;
srand(0);
Trainer trainer;
trainer.init(TrainerConfigHelper::createFromFlagConfig(), false);
data.parameters = trainer.getGradientMachine()->getParameters();
trainer.getDataProvider()->setSkipShuffle();
trainer.train();
}
void checkBuffer(real* A,
const char* desA,
real* B,
const char* desB,
size_t len,
size_t width = 1) {
int nNum = 0;
for (size_t i = 0; i < len; ++i) {
real diff = fabs(A[i] - B[i]);
if (diff > 0.0f &&
diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_max_diff_ratio) {
nNum++;
LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i]
<< " " << desB << " : " << B[i];
}
}
EXPECT_EQ(0, nNum);
LOG(INFO) << "\n\n";
}
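checkBuffer tolerates element pairs whose relative difference stays within FLAGS_max_diff_ratio; the same rule rendered in numpy (illustrative only):

import numpy as np

# Count elements whose relative difference exceeds the allowed ratio.
def count_mismatches(a, b, max_diff_ratio):
    diff = np.abs(a - b)
    denom = np.maximum(np.abs(a), np.abs(b))
    rel = np.divide(diff, denom, out=np.zeros_like(diff), where=denom > 0)
    return int((rel > max_diff_ratio).sum())

a = np.array([1.0, 2.0, 3.0])
b = np.array([1.0, 2.0002, 3.1])
assert count_mismatches(a, b, 2e-4) == 1  # only the last pair fails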
void compareGradient(ComData& comDataA, ComData& comDataB) {
vector<Argument> outArgsA = comDataA.outArgs;
vector<Argument> outArgsB = comDataB.outArgs;
for (size_t i = 0; i < outArgsA.size(); ++i) {
CpuMatrix matA(outArgsA[i].value->getHeight(),
outArgsA[i].value->getWidth());
CpuMatrix matB(outArgsB[i].value->getHeight(),
outArgsB[i].value->getWidth());
matA.copyFrom(*outArgsA[i].value);
matB.copyFrom(*outArgsB[i].value);
LOG(INFO) << "\n--------------------------------"
<< " Check Network Output_" << i << ":"
<< " -------------------------------------\n";
checkBuffer(matA.getData(),
"network A output",
matB.getData(),
"network B output",
matA.getElementCnt(),
matA.getWidth());
}
vector<ParameterPtr>& parametersA = comDataA.parameters;
vector<ParameterPtr>& parametersB = comDataB.parameters;
LOG(INFO) << "\n\n--------------------------------"
<< " Check Gradient Machine Parameters:"
<< " -------------------------------------\n";
for (size_t i = 0; i < parametersA.size(); ++i) {
ParameterPtr parameterA, parameterB;
parameterA = parametersA[i];
parameterB = parametersB[i];
CpuVector paraA(parameterA->getSize());
CpuVector paraB(parameterB->getSize());
paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE));
paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE));
LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName()
<< " ; size : " << paraA.getSize() << " ------------";
checkBuffer(paraA.getData(),
"Network A",
paraB.getData(),
"Network B",
paraA.getSize());
CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT));
CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT));
LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName()
<< " ; size : " << gradA.getSize() << " -----------";
checkBuffer(gradA.getData(),
"Network A",
gradB.getData(),
"Network B",
gradA.getSize());
}
}
TEST(Trainer, create) {
ComData dataA;
calcGradient(dataA, FLAGS_config_file_a);
LOG(INFO) << "\n\ntraining of Network A is finished\n\n";
ComData dataB;
calcGradient(dataB, FLAGS_config_file_b);
  LOG(INFO) << "\n\ntraining of Network B is finished\n\n";
compareGradient(dataA, dataB);
}
int main(int argc, char** argv) {
paddle::initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
initPython(argc, argv);
#ifndef PADDLE_TYPE_DOUBLE
if (FLAGS_need_high_accuracy) {
LOG(INFO) << "skip test due to it's need high accuracy";
return 0;
}
if (FLAGS_max_diff_ratio == 0.0f) {
FLAGS_max_diff_ratio = 2e-4;
LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
<< " in low accuracy mode";
}
#else
if (FLAGS_max_diff_ratio == 0.0f) {
FLAGS_max_diff_ratio = 2e-7;
LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
<< " in high accuracy mode";
}
#endif
int ret = RUN_ALL_TESTS();
return ret;
}
......@@ -25,45 +25,9 @@ limitations under the License. */
#include <unordered_set>
#include "picojson.h"
void checkEqual(const paddle::Argument& expect, const paddle::Argument& actual);
void checkValue(std::vector<paddle::Argument>& arguments, picojson::array& arr);
const std::string kDir = "./trainer/tests/pydata_provider_wrapper_dir/";
TEST(PyDataProviderWrapper, NoSequenceData) {
paddle::DataConfig conf;
conf.set_type("py");
conf.set_load_data_module(std::string("testPyDataWrapper"));
conf.set_load_data_object(std::string("processNonSequenceData"));
conf.set_async_load_data(false);
conf.clear_files();
conf.set_files(kDir + "test_pydata_provider_wrapper.list");
paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false));
provider->setSkipShuffle();
provider->reset();
paddle::DataBatch batchFromPy;
provider->getNextBatch(100, &batchFromPy);
paddle::DataConfig conf2;
conf2.set_type("proto");
conf2.set_async_load_data(false);
conf2.clear_files();
conf2.set_files(kDir + "test_pydata_provider_wrapper.protolist");
provider.reset(paddle::DataProvider::create(conf2, false));
provider->setSkipShuffle();
provider->reset();
paddle::DataBatch batchFromProto;
provider->getNextBatch(100, &batchFromProto);
std::vector<paddle::Argument>& pyArguments = batchFromPy.getStreams();
std::vector<paddle::Argument>& protoArguments = batchFromProto.getStreams();
EXPECT_EQ(pyArguments.size(), protoArguments.size());
for (size_t i = 0; i < pyArguments.size(); ++i) {
checkEqual(protoArguments[i], pyArguments[i]);
}
}
TEST(PyDataProviderWrapper, SequenceData) {
paddle::DataConfig conf;
conf.set_type("py");
......@@ -148,66 +112,6 @@ int main(int argc, char** argv) {
return RUN_ALL_TESTS();
}
void checkEqual(const paddle::Argument& expect,
const paddle::Argument& actual) {
if (expect.value) {
EXPECT_TRUE(actual.value != nullptr);
paddle::Matrix* e = expect.value.get();
paddle::Matrix* a = actual.value.get();
EXPECT_EQ(e->getWidth(), a->getWidth());
EXPECT_EQ(e->getHeight(), a->getHeight());
if (dynamic_cast<paddle::CpuSparseMatrix*>(e)) {
paddle::CpuSparseMatrix* se = dynamic_cast<paddle::CpuSparseMatrix*>(e);
paddle::CpuSparseMatrix* sa = dynamic_cast<paddle::CpuSparseMatrix*>(a);
EXPECT_EQ(se->getFormat(), sa->getFormat());
EXPECT_EQ(se->getElementCnt(), sa->getElementCnt());
size_t rowSize = se->getFormat() == paddle::SPARSE_CSC
? se->getElementCnt()
: se->getHeight() + 1;
size_t colSize = se->getFormat() == paddle::SPARSE_CSC
? se->getWidth() + 1
: se->getElementCnt();
for (size_t i = 0; i < rowSize; ++i) {
EXPECT_EQ(se->getRows()[i], sa->getRows()[i]);
}
for (size_t i = 0; i < colSize; ++i) {
EXPECT_EQ(se->getCols()[i], sa->getCols()[i]);
}
if (se->getValueType() == paddle::FLOAT_VALUE) {
EXPECT_EQ(paddle::FLOAT_VALUE, sa->getValueType());
for (size_t i = 0; i < se->getElementCnt(); ++i) {
EXPECT_EQ(se->getValue()[i], sa->getValue()[i]);
}
}
} else if (dynamic_cast<paddle::CpuMatrix*>(e)) {
EXPECT_EQ(e->getElementCnt(), a->getElementCnt());
for (size_t i = 0; i < e->getElementCnt(); ++i) {
EXPECT_EQ(e->getData()[i], a->getData()[i]);
}
}
}
if (expect.ids) {
EXPECT_TRUE(actual.ids != nullptr);
paddle::VectorT<int>* e = expect.ids.get();
paddle::VectorT<int>* a = actual.ids.get();
EXPECT_EQ(e->getSize(), a->getSize());
for (size_t i = 0; i < e->getSize(); ++i) {
EXPECT_EQ(e->getData()[i], a->getData()[i]);
}
}
if (expect.strs) {
EXPECT_TRUE(actual.strs != nullptr);
std::vector<std::string>* e = expect.strs.get();
std::vector<std::string>* a = actual.strs.get();
EXPECT_EQ(e->size(), a->size());
for (size_t i = 0; i < e->size(); ++i) {
EXPECT_EQ((*e)[i], (*a)[i]);
}
}
}
void checkValue(std::vector<paddle::Argument>& arguments,
picojson::array& arr) {
// CHECK SLOT 0, Sparse Value.
......
......@@ -1826,7 +1826,7 @@ class FCLayer(LayerBase):
self.layer_type = 'mkldnn_fc'
config_assert(
len(inputs) == 1,
"MkldnnFCLayer support one and only one input!")
"MKLDNNFCLayer support one and only one input!")
super(FCLayer, self).__init__(
name, self.layer_type, size, inputs=inputs, **xargs)
for input_index in xrange(len(self.inputs)):
......@@ -1837,7 +1837,7 @@ class FCLayer(LayerBase):
sparse = format == "csr" or format == "csc"
if use_mkldnn:
config_assert(not sparse,
"MkldnnFCLayer do not support sparse format yet")
"MKLDNNFCLayer do not support sparse format yet")
if use_mkldnn_wgt:
dims = [self.config.size, input_layer.size]
if sparse:
......@@ -1853,7 +1853,7 @@ class FCLayer(LayerBase):
@config_layer('mkldnn_fc')
class MkldnnFcLayer(FCLayer):
class MKLDNNFcLayer(FCLayer):
layer_type = 'mkldnn_fc'
......@@ -3209,6 +3209,18 @@ class SubNestedSequenceLayer(LayerBase):
self.set_layer_size(size)
@config_layer('dot_prod')
class DotProdLayer(LayerBase):
def __init__(self, name, inputs, device=None):
super(DotProdLayer, self).__init__(
name, 'dot_prod', 0, inputs, device=device)
config_assert(len(inputs) == 2, 'DotProdLayer must have 2 inputs.')
config_assert(
self.get_input_layer(0).size == self.get_input_layer(1).size,
"Two inputs should have the same size.")
self.set_layer_size(1)
@config_layer('out_prod')
class OuterProdLayer(LayerBase):
def __init__(self, name, inputs, device=None):
......@@ -3506,11 +3518,17 @@ def ExpressionLayer(name, inputs, **xargs):
@config_layer('concat')
class ConcatenateLayer(LayerBase):
layer_type = 'concat'
def __init__(self, name, inputs, bias=False, **xargs):
config_assert(inputs, 'inputs cannot be empty')
config_assert(not bias, 'ConcatenateLayer cannot support bias.')
use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
if self.layer_type == "mkldnn_concat":
config_assert(use_mkldnn, "mkldnn_concat only support MKLDNN")
self.layer_type = 'mkldnn_concat' if use_mkldnn else 'concat'
super(ConcatenateLayer, self).__init__(
name, 'concat', 0, inputs=inputs, **xargs)
name, self.layer_type, 0, inputs=inputs, **xargs)
size = 0
for input_index in xrange(len(self.inputs)):
assert self.get_input_layer(0).height == self.get_input_layer(
......@@ -3530,6 +3548,11 @@ class ConcatenateLayer(LayerBase):
self.set_layer_size(size)
@config_layer('mkldnn_concat')
class MKLDNNConcatLayer(ConcatenateLayer):
layer_type = 'mkldnn_concat'
# like concat layer, but each input layer was processed by a Projection.
@config_layer('concat2')
class ConcatenateLayer2(LayerBase):
......
......@@ -115,6 +115,7 @@ __all__ = [
'huber_classification_cost',
'block_expand_layer',
'maxout_layer',
'dot_prod_layer',
'out_prod_layer',
'printer_layer',
'print_layer',
......@@ -198,6 +199,7 @@ class LayerType(object):
SCALING_LAYER = 'scaling'
TRANS_LAYER = 'trans'
ROTATE_LAYER = 'rotate'
DOT_PROD_LAYER = 'dot_prod'
OUT_PROD_LAYER = 'out_prod'
FEATURE_MAP_EXPAND_LAYER = 'featmap_expand'
......@@ -4143,6 +4145,45 @@ def maxid_layer(input, name=None, layer_attr=None):
size=l.config.size)
@wrap_name_default()
def dot_prod_layer(input1, input2, name=None, layer_attr=None):
"""
A layer for computing the dot product of two vectors.
The example usage is:
.. code-block:: python
dot_prod = dot_prod_layer(input1=vec1, input2=vec2)
:param name: The name of this layer. It is optional.
:type name: basestring
:param input1: The first input layer.
    :type input1: LayerOutput
:param input2: The second input layer.
:type input2: LayerOutput
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute.
:return: LayerOutput object.
:rtype: LayerOutput
"""
assert isinstance(input1, LayerOutput)
assert isinstance(input2, LayerOutput)
assert input1.size == input2.size, ("Two inputs should have the same size.")
l = Layer(
name=name,
type=LayerType.DOT_PROD_LAYER,
inputs=[input1.name, input2.name],
**ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(
name=name,
layer_type=LayerType.DOT_PROD_LAYER,
parents=[input1, input2],
size=l.config.size)
@wrap_name_default()
def out_prod_layer(input1, input2, name=None, layer_attr=None):
"""
......
......@@ -11,6 +11,6 @@ test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_l
test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer
test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer test_scale_sub_region_layer
test_factorization_machine)
test_dot_prod_layer test_factorization_machine)
export whole_configs=(test_split_datasource)
type: "nn"
layers {
name: "vector1"
type: "data"
size: 10
active_type: ""
}
layers {
name: "vector2"
type: "data"
size: 10
active_type: ""
}
layers {
name: "__dot_prod_layer_0__"
type: "dot_prod"
size: 1
active_type: ""
inputs {
input_layer_name: "vector1"
}
inputs {
input_layer_name: "vector2"
}
}
input_layer_names: "vector1"
input_layer_names: "vector2"
output_layer_names: "__dot_prod_layer_0__"
sub_models {
name: "root"
layer_names: "vector1"
layer_names: "vector2"
layer_names: "__dot_prod_layer_0__"
input_layer_names: "vector1"
input_layer_names: "vector2"
output_layer_names: "__dot_prod_layer_0__"
is_recurrent_layer_group: false
}
from paddle.trainer_config_helpers import *
vec1 = data_layer(name='vector1', size=10)
vec2 = data_layer(name='vector2', size=10)
dot_product = dot_prod_layer(input1=vec1, input2=vec2)
outputs(dot_product)
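Semantically the layer reduces each pair of input rows to one scalar, which is why the generated config above fixes the output size at 1; a plain numpy check (ours, not layer code):

import numpy as np

vec1 = np.random.rand(8, 10)  # a batch of 8 size-10 vectors
vec2 = np.random.rand(8, 10)
out = (vec1 * vec2).sum(axis=1, keepdims=True)
assert out.shape == (8, 1)    # one dot product per pair of rows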
......@@ -4,7 +4,10 @@ import collections
import numpy as np
import copy
__all__ = ['Block', 'Variable', 'Program', 'Operator', 'default_startup_program', 'default_main_program']
__all__ = [
'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
'default_main_program'
]
def unique_name(prefix):
......@@ -232,17 +235,17 @@ class Operator(object):
in_proto.name)
if found:
in_argus = inputs[in_proto.name]
if not isinstance(in_argus, list):
in_argus = [in_argus]
if not in_proto.duplicable and len(in_argus) > 1:
in_args = inputs[in_proto.name]
if not isinstance(in_args, list):
in_args = [in_args]
if not in_proto.duplicable and len(in_args) > 1:
raise ValueError(
"Input %s expects only one input, but %d are given."
% (in_proto.name, len(in_argus)))
in_argu_names = []
for argu in in_argus:
in_argu_names.append(argu.name)
self.desc.set_input(in_proto.name, in_argu_names)
% (in_proto.name, len(in_args)))
in_arg_names = []
for arg in in_args:
in_arg_names.append(arg.name)
self.desc.set_input(in_proto.name, in_arg_names)
else:
self.desc.set_input(in_proto.name, [])
......@@ -260,18 +263,18 @@ class Operator(object):
str(e) for e in given)))
for out_proto in proto.outputs:
out_argus = outputs[out_proto.name]
if not isinstance(out_argus, list):
out_argus = [out_argus]
if not out_proto.duplicable and len(out_argus) > 1:
out_args = outputs[out_proto.name]
if not isinstance(out_args, list):
out_args = [out_args]
if not out_proto.duplicable and len(out_args) > 1:
raise ValueError(
"Output %s expects only one output, but %d are given." %
(out_proto.name, len(out_argus)))
out_argu_names = []
for argu in out_argus:
out_argu_names.append(argu.name)
argu.op = self
self.desc.set_output(out_proto.name, out_argu_names)
(out_proto.name, len(out_args)))
out_arg_names = []
for arg in out_args:
out_arg_names.append(arg.name)
arg.op = self
self.desc.set_output(out_proto.name, out_arg_names)
if attrs is not None:
if not isinstance(attrs, dict):
......@@ -582,8 +585,10 @@ class Parameter(Variable):
g_main_program = Program()
g_startup_program = Program()
def default_startup_program():
return g_startup_program
def default_main_program():
return g_main_program
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.core as core
import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.framework as framework
from paddle.v2.fluid.io import save_persistables, load_persistables
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.io import save_persistables, load_persistables
from paddle.v2.fluid.optimizer import SGDOptimizer
import numpy as np
x = layers.data(
name='x',
shape=[13],
data_type='float32')
x = layers.data(name='x', shape=[13], data_type='float32')
y_predict = layers.fc(input=x,
size=1,
act=None)
y_predict = layers.fc(input=x, size=1, act=None)
y = layers.data(
name='y',
shape=[1],
data_type='float32')
y = layers.data(name='y', shape=[1], data_type='float32')
cost = layers.square_error_cost(
input=y_predict,
label=y)
cost = layers.square_error_cost(input=y_predict, label=y)
avg_cost = layers.mean(x=cost)
sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
sgd_optimizer = SGDOptimizer(learning_rate=0.001)
opts = sgd_optimizer.minimize(avg_cost)
BATCH_SIZE = 20
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
import paddle.v2.fluid.optimizer as optimizer
from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.framework as framework
from paddle.v2.fluid.initializer import XavierInitializer
from paddle.v2.fluid.optimizer import AdamOptimizer
def resnet_cifar10(input, depth=32):
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu'):
def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
tmp = layers.conv2d(
input=input,
filter_size=filter_size,
......@@ -24,9 +19,7 @@ def resnet_cifar10(input, depth=32):
padding=padding,
act=None,
bias_attr=False)
return layers.batch_norm(
input=tmp,
act=act)
return layers.batch_norm(input=tmp, act=act)
def shortcut(input, ch_in, ch_out, stride, program, init_program):
if ch_in != ch_out:
......@@ -35,28 +28,11 @@ def resnet_cifar10(input, depth=32):
else:
return input
def basicblock(input,
ch_in,
ch_out,
stride):
tmp = conv_bn_layer(
input,
ch_out,
3,
stride,
1)
tmp = conv_bn_layer(
tmp,
ch_out,
3,
1,
1,
act=None)
def basicblock(input, ch_in, ch_out, stride):
tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
short = shortcut(input, ch_in, ch_out, stride)
return layers.elementwise_add(
x=tmp,
y=short,
act='relu')
return layers.elementwise_add(x=tmp, y=short, act='relu')
def layer_warp(block_func, input, ch_in, ch_out, count, stride):
tmp = block_func(input, ch_in, ch_out, stride)
......@@ -67,45 +43,17 @@ def resnet_cifar10(input, depth=32):
assert (depth - 2) % 6 == 0
n = (depth - 2) / 6
conv1 = conv_bn_layer(
input=input,
ch_out=16,
filter_size=3,
stride=1,
padding=1)
res1 = layer_warp(
basicblock,
conv1,
16,
16,
n,
1)
res2 = layer_warp(
basicblock,
res1,
16,
32,
n,
2)
res3 = layer_warp(
basicblock,
res2,
32,
64,
n,
2)
input=input, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
pool = layers.pool2d(
input=res3,
pool_size=8,
pool_type='avg',
pool_stride=1)
input=res3, pool_size=8, pool_type='avg', pool_stride=1)
return pool
def vgg16_bn_drop(input):
def conv_block(input,
num_filter,
groups,
dropouts):
def conv_block(input, num_filter, groups, dropouts):
return nets.img_conv_group(
input=input,
pool_size=2,
......@@ -123,22 +71,14 @@ def vgg16_bn_drop(input):
conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = layers.dropout(
x=conv5,
dropout_prob=0.5)
drop = layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = layers.fc(input=drop,
size=512,
act=None,
param_attr={"initializer": XavierInitializer()})
reshape1 = layers.reshape(
x=fc1,
shape=list(fc1.shape + (1, 1)))
bn = layers.batch_norm(
input=reshape1,
act='relu')
drop2 = layers.dropout(
x=bn,
dropout_prob=0.5)
reshape1 = layers.reshape(x=fc1, shape=list(fc1.shape + (1, 1)))
bn = layers.batch_norm(input=reshape1, act='relu')
drop2 = layers.dropout(x=bn, dropout_prob=0.5)
fc2 = layers.fc(input=drop2,
size=512,
act=None,
......@@ -165,8 +105,8 @@ cost = layers.cross_entropy(input=predict, label=label)
avg_cost = layers.mean(x=cost)
accuracy = layers.accuracy(input=predict, label=label)
# optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
optimizer = optimizer.AdamOptimizer(learning_rate=0.001)
# optimizer = SGDOptimizer(learning_rate=0.001)
optimizer = AdamOptimizer(learning_rate=0.001)
opts = optimizer.minimize(avg_cost)
BATCH_SIZE = 128
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
import paddle.v2.fluid.core as core
import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.evaluator as evaluator
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.optimizer import AdamOptimizer
import numpy as np
images = layers.data(
name='pixel',
shape=[1, 28, 28],
data_type='float32')
label = layers.data(
name='label',
shape=[1],
data_type='int64')
images = layers.data(name='pixel', shape=[1, 28, 28], data_type='float32')
label = layers.data(name='label', shape=[1], data_type='int64')
conv_pool_1 = nets.simple_img_conv_pool(
input=images,
filter_size=5,
......@@ -32,17 +25,13 @@ conv_pool_2 = nets.simple_img_conv_pool(
pool_stride=2,
act="relu")
predict = layers.fc(input=conv_pool_2,
size=10,
act="softmax")
predict = layers.fc(input=conv_pool_2, size=10, act="softmax")
cost = layers.cross_entropy(input=predict, label=label)
avg_cost = layers.mean(x=cost)
optimizer = optimizer.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
optimizer = AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
opts = optimizer.minimize(avg_cost)
accuracy, acc_out = evaluator.accuracy(
input=predict,
label=label)
accuracy, acc_out = evaluator.accuracy(input=predict, label=label)
BATCH_SIZE = 50
PASS_NUM = 3
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.core as core
import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.regularizer import L2DecayRegularizer
from paddle.v2.fluid.initializer import UniformInitializer
import numpy as np
from paddle.v2.fluid.optimizer import MomentumOptimizer
from paddle.v2.fluid.regularizer import L2DecayRegularizer
BATCH_SIZE = 128
image = layers.data(
name='x',
shape=[784],
data_type='float32')
image = layers.data(name='x', shape=[784], data_type='float32')
param_attr = {
'name': None,
......@@ -22,32 +18,21 @@ param_attr = {
'regularization': L2DecayRegularizer(0.0005 * BATCH_SIZE)
}
hidden1 = layers.fc(input=image,
size=128,
act='relu',
param_attr=param_attr)
hidden2 = layers.fc(input=hidden1,
size=64,
act='relu',
param_attr=param_attr)
hidden1 = layers.fc(input=image, size=128, act='relu', param_attr=param_attr)
hidden2 = layers.fc(input=hidden1, size=64, act='relu', param_attr=param_attr)
predict = layers.fc(input=hidden2,
size=10,
act='softmax',
param_attr=param_attr)
label = layers.data(
name='y',
shape=[1],
data_type='int64')
label = layers.data(name='y', shape=[1], data_type='int64')
cost = layers.cross_entropy(input=predict, label=label)
avg_cost = layers.mean(x=cost)
accuracy = layers.accuracy(
input=predict,
label=label)
accuracy = layers.accuracy(input=predict, label=label)
optimizer = optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
opts = optimizer.minimize(avg_cost)
train_reader = paddle.batch(
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
import paddle.v2.fluid.core as core
import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.executor import Executor
import numpy as np
from paddle.v2.fluid.optimizer import SGDOptimizer
IS_SPARSE = True
USE_GPU = False
......@@ -19,10 +18,7 @@ def get_usr_combined_features():
USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
uid = layers.data(
name='user_id',
shape=[1],
data_type='int64')
uid = layers.data(name='user_id', shape=[1], data_type='int64')
usr_emb = layers.embedding(
input=uid,
......@@ -31,15 +27,11 @@ def get_usr_combined_features():
param_attr={'name': 'user_table'},
is_sparse=IS_SPARSE)
usr_fc = layers.fc(input=usr_emb,
size=32)
usr_fc = layers.fc(input=usr_emb, size=32)
USR_GENDER_DICT_SIZE = 2
usr_gender_id = layers.data(
name='gender_id',
shape=[1],
data_type='int64')
usr_gender_id = layers.data(name='gender_id', shape=[1], data_type='int64')
usr_gender_emb = layers.embedding(
input=usr_gender_id,
......@@ -47,14 +39,10 @@ def get_usr_combined_features():
param_attr={'name': 'gender_table'},
is_sparse=IS_SPARSE)
usr_gender_fc = layers.fc(input=usr_gender_emb,
size=16)
usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
usr_age_id = layers.data(
name='age_id',
shape=[1],
data_type="int64")
usr_age_id = layers.data(name='age_id', shape=[1], data_type="int64")
usr_age_emb = layers.embedding(
input=usr_age_id,
......@@ -62,14 +50,10 @@ def get_usr_combined_features():
is_sparse=IS_SPARSE,
param_attr={'name': 'age_table'})
usr_age_fc = layers.fc(input=usr_age_emb,
size=16)
usr_age_fc = layers.fc(input=usr_age_emb, size=16)
USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
usr_job_id = layers.data(
name='job_id',
shape=[1],
data_type="int64")
usr_job_id = layers.data(name='job_id', shape=[1], data_type="int64")
usr_job_emb = layers.embedding(
input=usr_job_id,
......@@ -77,16 +61,12 @@ def get_usr_combined_features():
param_attr={'name': 'job_table'},
is_sparse=IS_SPARSE)
usr_job_fc = layers.fc(input=usr_job_emb,
size=16)
usr_job_fc = layers.fc(input=usr_job_emb, size=16)
concat_embed = layers.concat(
input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc],
axis=1)
input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
usr_combined_features = layers.fc(input=concat_embed,
size=200,
act="tanh")
usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
return usr_combined_features
......@@ -95,10 +75,7 @@ def get_mov_combined_features():
MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
mov_id = layers.data(
name='movie_id',
shape=[1],
data_type='int64')
mov_id = layers.data(name='movie_id', shape=[1], data_type='int64')
mov_emb = layers.embedding(
input=mov_id,
......@@ -107,36 +84,24 @@ def get_mov_combined_features():
param_attr={'name': 'movie_table'},
is_sparse=IS_SPARSE)
mov_fc = layers.fc(input=mov_emb,
size=32)
mov_fc = layers.fc(input=mov_emb, size=32)
CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
category_id = layers.data(
name='category_id',
shape=[1],
data_type='int64')
category_id = layers.data(name='category_id', shape=[1], data_type='int64')
mov_categories_emb = layers.embedding(
input=category_id,
size=[CATEGORY_DICT_SIZE, 32],
is_sparse=IS_SPARSE)
input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
mov_categories_hidden = layers.sequence_pool(
input=mov_categories_emb,
pool_type="sum")
input=mov_categories_emb, pool_type="sum")
MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
mov_title_id = layers.data(
name='movie_title',
shape=[1],
data_type='int64')
mov_title_id = layers.data(name='movie_title', shape=[1], data_type='int64')
mov_title_emb = layers.embedding(
input=mov_title_id,
size=[MOV_TITLE_DICT_SIZE, 32],
is_sparse=IS_SPARSE)
input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
mov_title_conv = nets.sequence_conv_pool(
input=mov_title_emb,
......@@ -146,13 +111,10 @@ def get_mov_combined_features():
pool_type="sum")
concat_embed = layers.concat(
input=[mov_fc, mov_categories_hidden, mov_title_conv],
axis=1)
input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
# FIXME(dzh) : need tanh operator
mov_combined_features = layers.fc(input=concat_embed,
size=200,
act="tanh")
mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
return mov_combined_features
......@@ -162,18 +124,11 @@ def model():
mov_combined_features = get_mov_combined_features()
# need cos sim
inference = layers.cos_sim(
X=usr_combined_features,
Y=mov_combined_features)
inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
label = layers.data(
name='score',
shape=[1],
data_type='float32')
label = layers.data(name='score', shape=[1], data_type='float32')
square_cost = layers.square_error_cost(
input=inference,
label=label)
square_cost = layers.square_error_cost(input=inference, label=label)
avg_cost = layers.mean(x=square_cost)
......@@ -182,7 +137,7 @@ def model():
def main():
cost = model()
sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.2)
sgd_optimizer = SGDOptimizer(learning_rate=0.2)
opts = sgd_optimizer.minimize(cost)
if USE_GPU:
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
import paddle.v2.fluid.core as core
import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.executor import Executor
import numpy as np
from paddle.v2.fluid.optimizer import AdamOptimizer
def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32):
......@@ -31,7 +30,7 @@ def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32):
act="softmax")
cost = layers.cross_entropy(input=prediction, label=label)
avg_cost = layers.mean(x=cost)
adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002)
adam_optimizer = AdamOptimizer(learning_rate=0.002)
opts = adam_optimizer.minimize(avg_cost)
acc = layers.accuracy(input=prediction, label=label)
return avg_cost, acc
......
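For reference, the cost and metric at the tail of convolution_net reduce to a few lines of NumPy (toy values; prediction is the softmax output, label holds int64 class ids):

```python
import numpy as np

prediction = np.array([[0.9, 0.1], [0.3, 0.7], [0.6, 0.4]])
label = np.array([0, 1, 1])

# cross_entropy picks -log(p) of the true class per row
cost = -np.log(prediction[np.arange(len(label)), label])
avg_cost = cost.mean()                                   # layers.mean
acc = (prediction.argmax(axis=1) == label).mean()        # layers.accuracy
```

The same cost/accuracy tail is shared by the stacked_lstm_net and lstm_net diffs below, so the sketch is not repeated there.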
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
import paddle.v2.fluid.core as core
import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
import numpy as np
from paddle.v2.fluid.optimizer import AdamOptimizer
def stacked_lstm_net(input_dim,
......@@ -41,7 +39,7 @@ def stacked_lstm_net(input_dim,
act='softmax')
cost = layers.cross_entropy(input=prediction, label=label)
avg_cost = layers.mean(x=cost)
adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002)
adam_optimizer = AdamOptimizer(learning_rate=0.002)
opts = adam_optimizer.minimize(avg_cost)
acc = layers.accuracy(input=prediction, label=label)
return avg_cost, acc
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.core as core
import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
import numpy as np
from paddle.v2.fluid.optimizer import AdamOptimizer
def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
......@@ -33,7 +32,7 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
cost = layers.cross_entropy(input=prediction, label=label)
avg_cost = layers.mean(x=cost)
adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002)
adam_optimizer = AdamOptimizer(learning_rate=0.002)
opts = adam_optimizer.minimize(avg_cost)
acc = layers.accuracy(input=prediction, label=label)
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.core as core
import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
import numpy as np
from paddle.v2.fluid.optimizer import SGDOptimizer
PASS_NUM = 100
EMBED_SIZE = 32
......@@ -17,26 +16,11 @@ IS_SPARSE = True
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
first_word = layers.data(
name='firstw',
shape=[1],
data_type='int64')
second_word = layers.data(
name='secondw',
shape=[1],
data_type='int64')
third_word = layers.data(
name='thirdw',
shape=[1],
data_type='int64')
forth_word = layers.data(
name='forthw',
shape=[1],
data_type='int64')
next_word = layers.data(
name='nextw',
shape=[1],
data_type='int64')
first_word = layers.data(name='firstw', shape=[1], data_type='int64')
second_word = layers.data(name='secondw', shape=[1], data_type='int64')
third_word = layers.data(name='thirdw', shape=[1], data_type='int64')
forth_word = layers.data(name='forthw', shape=[1], data_type='int64')
next_word = layers.data(name='nextw', shape=[1], data_type='int64')
embed_first = layers.embedding(
input=first_word,
......@@ -64,19 +48,12 @@ embed_forth = layers.embedding(
param_attr={'name': 'shared_w'})
concat_embed = layers.concat(
input=[embed_first, embed_second, embed_third, embed_forth],
axis=1)
hidden1 = layers.fc(input=concat_embed,
size=HIDDEN_SIZE,
act='sigmoid')
predict_word = layers.fc(input=hidden1,
size=dict_size,
act='softmax')
cost = layers.cross_entropy(
input=predict_word,
label=next_word)
input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
hidden1 = layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid')
predict_word = layers.fc(input=hidden1, size=dict_size, act='softmax')
cost = layers.cross_entropy(input=predict_word, label=next_word)
avg_cost = layers.mean(x=cost)
sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
sgd_optimizer = SGDOptimizer(learning_rate=0.001)
opts = sgd_optimizer.minimize(avg_cost)
train_reader = paddle.batch(
......
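The word-embedding file above builds a classic 4-gram model: four context words share one embedding table ('shared_w'), their embeddings are concatenated, pushed through a sigmoid hidden layer, and a softmax over the dictionary predicts the next word. A toy NumPy forward pass of that pipeline; dict_size and HIDDEN_SIZE are assumed values, since the real ones come from the truncated part of the file:

```python
import numpy as np

dict_size, EMBED_SIZE, HIDDEN_SIZE = 1000, 32, 256   # sizes assumed
shared_w = np.random.randn(dict_size, EMBED_SIZE) * 0.1  # 'shared_w' table

words = np.array([3, 17, 42, 7])               # first/second/third/forth ids
concat_embed = shared_w[words].reshape(1, -1)  # concat -> (1, 4 * EMBED_SIZE)

w1 = np.random.randn(4 * EMBED_SIZE, HIDDEN_SIZE) * 0.1
hidden1 = 1.0 / (1.0 + np.exp(-concat_embed.dot(w1)))    # fc + sigmoid

w2 = np.random.randn(HIDDEN_SIZE, dict_size) * 0.1
logits = hidden1.dot(w2)
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)  # softmax

next_word = 5
cost = -np.log(probs[0, next_word])            # cross_entropy, one sample
```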
......@@ -110,13 +110,30 @@ class TestConv2dOp(OpTest):
self.op_type = "conv2d"
class TestWithPad(TestConv2dOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3]
class TestWithStride(TestConv2dOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 6, 6] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3]
class TestWithGroup(TestConv2dOp):
def init_group(self):
self.groups = 3
def init_op_type(self):
self.op_type = "conv2d"
class TestWith1x1(TestConv2dOp):
def init_test_case(self):
......@@ -127,15 +144,9 @@ class TestWith1x1(TestConv2dOp):
f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 1, 1]
def init_dilation(self):
self.dilations = [1, 1]
def init_group(self):
self.groups = 3
def init_op_type(self):
self.op_type = "conv2d"
class TestWithDilation(TestConv2dOp):
def init_test_case(self):
......@@ -152,14 +163,19 @@ class TestWithDilation(TestConv2dOp):
def init_group(self):
self.groups = 3
def init_op_type(self):
self.op_type = "conv2d"
#----------------Conv2dCudnn----------------
class TestCudnn(TestConv2dOp):
def init_op_type(self):
self.op_type = "conv_cudnn"
class TestCudnnWithPad(TestWithPad):
def init_op_type(self):
self.op_type = "conv_cudnn"
class TestCudnnWithStride(TestWithStride):
def init_op_type(self):
self.op_type = "conv_cudnn"
......
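These conv2d test additions follow the suite's parameterize-by-subclass pattern: the base case drives the checks, and each variant only overrides an init_* hook to change padding, stride, groups, or the operator under test. A stripped-down sketch of the pattern; the hook names mirror the tests above, but the base logic here is a plain unittest stand-in, not OpTest itself:

```python
import unittest

class Base(unittest.TestCase):
    def setUp(self):
        self.init_test_case()
        self.init_op_type()

    def init_test_case(self):
        self.pad = [0, 0]
        self.stride = [1, 1]

    def init_op_type(self):
        self.op_type = "conv2d"

    def test_config(self):
        # every subclass re-runs the same checks with its own config
        self.assertEqual(len(self.pad), 2)

class WithPad(Base):
    def init_test_case(self):
        self.pad = [1, 1]
        self.stride = [1, 1]

class Cudnn(Base):
    def init_op_type(self):
        self.op_type = "conv_cudnn"  # same cases, different kernel under test

if __name__ == '__main__':
    unittest.main()
```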
......@@ -4,9 +4,7 @@ from op_test import OpTest
def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
# [2, 3, 5, 5]
in_n, in_c, in_h, in_w = input_.shape
# [3, 6, 3, 3]
f_c, out_c, f_h, f_w = filter_.shape
assert in_c == f_c
......@@ -29,6 +27,7 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
j1, j2 = j * stride[0], j * stride[0] + f_w
out[n, k, i1:i2, j1:j2] += tmp_out
out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]]
return out
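The trimming line added to this naive reference completes the usual transpose-convolution geometry: each spatial extent is (in - 1) * stride + kernel before pad is cut from both ends (the formula is spelled out in the 3-D variant further down). A quick check of that arithmetic; the stride-1, pad-0 case reproduces the [2, 6, 7, 7] output noted in the comment removed below:

```python
# out = (in - 1) * stride + kernel, then pad trimmed from both sides
in_h, f_h = 5, 3
for stride, pad in [(1, 0), (1, 1), (2, 1)]:
    out_h = (in_h - 1) * stride + f_h
    print(stride, pad, out_h - 2 * pad)   # -> 7, 5, 9
```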
......@@ -36,8 +35,6 @@ class TestConv2dTransposeOp(OpTest):
def setUp(self):
# init as conv transpose
self.init_op_type()
# [2, 3, 5, 5] -> kernel [3, 6, 3, 3] -> output [2, 6, 7, 7]
self.init_test_case()
conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad}
......@@ -55,7 +52,6 @@ class TestConv2dTransposeOp(OpTest):
self.outputs = {'Output': output}
def test_check_output(self):
print 'check output here for', self.op_type
self.check_output()
def test_check_grad_no_input(self):
......@@ -88,6 +84,26 @@ class TestConv2dTransposeOp(OpTest):
self.op_type = "conv2d_transpose"
class TestWithPad(TestConv2dTransposeOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [1, 1]
self.dilations = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3]
class TestWithStride(TestConv2dTransposeOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [2, 2]
self.dilations = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3]
# ------------ test_cudnn ------------
class TestCudnn(TestConv2dTransposeOp):
def init_op_type(self):
......
......@@ -4,9 +4,7 @@ from op_test import OpTest
def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
# [2, 3, 5, 5, 5]
in_n, in_c, in_d, in_h, in_w = input_.shape
# [3, 6, 3, 3, 3]
f_c, out_c, f_d, f_h, f_w = filter_.shape
assert in_c == f_c
......@@ -14,7 +12,6 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
out_d = (in_d - 1) * stride[0] + f_d
out_h = (in_h - 1) * stride[1] + f_h
out_w = (in_w - 1) * stride[2] + f_w
out = np.zeros((in_n, out_c, out_d, out_h, out_w))
for n in range(in_n):
......@@ -33,6 +30,8 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
j1, j2 = j * stride[2], j * stride[2] + f_w
out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out
out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w -
pad[2]]
return out
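A quick shape check of the out_d/out_h/out_w formula above against the two test cases added below (TestWithPad: stride 1, pad 1; TestWithStride: stride 2, pad 1), both on a 5^3 input with a 3^3 kernel:

```python
in_size, k = 5, 3
for s, p in [(1, 1), (2, 1)]:
    out = (in_size - 1) * s + k      # before trimming the padding
    print(s, p, out - 2 * p)         # -> 5 and 9 per spatial dim
```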
......@@ -40,8 +39,6 @@ class TestConv3dTransposeOp(OpTest):
def setUp(self):
# init as conv transpose
self.init_op_type()
# [2, 3, 5, 5, 5] -> kernel [3, 6, 3, 3, 3] -> output [2, 6, 7, 7, 7]
self.init_test_case()
conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad}
......@@ -49,7 +46,6 @@ class TestConv3dTransposeOp(OpTest):
filter_ = np.random.random(self.filter_size).astype("float32")
output = conv3dtranspose_forward_naive(
input_, filter_, conv3dtranspose_param).astype("float32")
# print 'deconv output py', output, output.shape
self.inputs = {'Input': input_, 'Filter': filter_}
self.attrs = {
......@@ -60,7 +56,6 @@ class TestConv3dTransposeOp(OpTest):
self.outputs = {'Output': output}
def test_check_output(self):
print 'check output here'
self.check_output()
def test_check_grad(self):
......@@ -85,7 +80,7 @@ class TestConv3dTransposeOp(OpTest):
self.pad = [0, 0, 0]
self.stride = [1, 1, 1]
self.dilations = [1, 1, 1]
self.input_size = [2, 3, 5, 5, 5] # NCHW
self.input_size = [2, 3, 5, 5, 5] # NCDHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3, 3]
......@@ -93,5 +88,31 @@ class TestConv3dTransposeOp(OpTest):
self.op_type = "conv3d_transpose"
class TestWithPad(TestConv3dTransposeOp):
def init_test_case(self):
self.pad = [1, 1, 1]
self.stride = [1, 1, 1]
self.dilations = [1, 1, 1]
self.input_size = [2, 3, 5, 5, 5] # NCDHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3, 3]
class TestWithStride(TestConv3dTransposeOp):
def init_test_case(self):
self.pad = [1, 1, 1]
self.stride = [2, 2, 2]
self.dilations = [1, 1, 1]
self.input_size = [2, 3, 5, 5, 5] # NCDHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3, 3]
# ------------ test_cudnn ------------
class TestCudnn(TestConv3dTransposeOp):
def init_op_type(self):
self.op_type = "conv3d_transpose_cudnn"
if __name__ == '__main__':
unittest.main()
......@@ -6,7 +6,8 @@ from test_lstm_op import identity, sigmoid, tanh, relu
class TestGRUOp(OpTest):
batch_size = 9
lod = [[0, 2, 6, 9]]
batch_size = lod[0][-1]
frame_size = 5
activate = {
'identity': identity,
......@@ -35,7 +36,7 @@ class TestGRUOp(OpTest):
seq_starts[sorted_seqs[i]] + batch_idx)
idx_in_seq.append(idx)
idx_in_seq_list.append(idx_in_seq)
return idx_in_seq_list
return idx_in_seq_list, sorted_seqs
def gru_step(self, x, h_p, w, b):
batch_size = x.shape[0]
......@@ -66,8 +67,8 @@ class TestGRUOp(OpTest):
batch_hidden = self.outputs['BatchHidden']
hidden = self.outputs['Hidden']
idx_in_seq_list = self.idx_in_seq_list
h_p = self.inputs['H0'] if self.inputs.has_key('H0') else np.zeros(
(len(idx_in_seq_list[0]), self.frame_size))
h_p = self.inputs['H0'][self.sorted_seqs] if self.inputs.has_key(
'H0') else np.zeros((len(idx_in_seq_list[0]), self.frame_size))
num_batch = len(idx_in_seq_list)
end_idx = 0
for batch_idx in range(num_batch):
......@@ -84,8 +85,9 @@ class TestGRUOp(OpTest):
return batch_gate, batch_reset_hidden_prev, hidden
def set_data(self):
lod = [[0, 2, 6, self.batch_size]]
self.idx_in_seq_list = self.seq_to_batch(lod, self.is_reverse)
lod = self.lod
self.idx_in_seq_list, self.sorted_seqs = self.seq_to_batch(
lod, self.is_reverse)
batch_size = self.batch_size
frame_size = self.frame_size
input = np.random.rand(batch_size, frame_size * 3).astype('float64')
......@@ -146,7 +148,7 @@ class TestGRUOpReverse(TestGRUOp):
def set_confs(self):
self.is_reverse = True
self.attrs = {
'activation': 'identity',
'activation': 'tanh',
'gate_activation': 'sigmoid',
'is_reverse': self.is_reverse
}
......
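The H0 change above is the point of this fix: seq_to_batch appears to order sequences longest-first when building the batch layout, so the per-sequence initial hidden states must be permuted with the same sorted_seqs index before the first gru_step. A toy illustration of the reordering, using the lod from the test:

```python
import numpy as np

lod = [[0, 2, 6, 9]]                  # 3 sequences with lengths 2, 4, 3
lengths = np.diff(lod[0])             # [2, 4, 3]
sorted_seqs = np.argsort(-lengths)    # [1, 2, 0], longest sequence first

H0 = np.arange(3 * 5).reshape(3, 5)   # one 5-d initial state per sequence
h_p = H0[sorted_seqs]                 # states permuted into batch order
```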
import unittest
import numpy as np
from paddle.v2.framework.op import Operator
import paddle.v2.framework.core as core
from paddle.v2.fluid.op import Operator
import paddle.v2.fluid.core as core
def create_tensor(scope, name, np_data):
......
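The body of create_tensor is truncated here; a plausible sketch based on the common pattern in these tests is below. The scope.var(...).get_tensor() / tensor.set(...) calls are assumptions about the fluid API of this era, not confirmed by the diff:

```python
import numpy as np
import paddle.v2.fluid.core as core

def create_tensor(scope, name, np_data):
    # place a numpy array into a named variable inside the scope (assumed API)
    tensor = scope.var(name).get_tensor()
    tensor.set(np_data, core.CPUPlace())
    return tensor
```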