diff --git a/CMakeLists.txt b/CMakeLists.txt index fd3582a1bca199d62d19550ffdd1efe9db520fa7..ae8728f4d4c22f45f13a283a448e907337f37f7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,8 +36,7 @@ include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) -option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND}) -option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND}) +option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) @@ -82,10 +81,8 @@ if(ANDROID OR IOS) "Disable PYTHON when cross-compiling for Android and iOS" FORCE) set(WITH_RDMA OFF CACHE STRING "Disable RDMA when cross-compiling for Android and iOS" FORCE) - set(WITH_MKLDNN OFF CACHE STRING - "Disable MKLDNN when cross-compiling for Android and iOS" FORCE) - set(WITH_MKLML OFF CACHE STRING - "Disable MKLML package when cross-compiling for Android and iOS" FORCE) + set(WITH_MKL OFF CACHE STRING + "Disable MKL when cross-compiling for Android and iOS" FORCE) # Compile PaddlePaddle mobile inference library if (NOT WITH_C_API) @@ -111,6 +108,14 @@ else() set(THIRD_PARTY_BUILD_TYPE Release) endif() +set(WITH_MKLML ${WITH_MKL}) +if (WITH_MKL AND ${AVX2_FOUND}) + set(WITH_MKLDNN ON) +else() + message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN") + set(WITH_MKLDNN OFF) +endif() + ######################################################################################## include(external/mklml) # download mklml package @@ -158,14 +163,15 @@ set(EXTERNAL_LIBS ) if(WITH_GPU) - list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) - if(NOT WITH_DSO) - list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) - endif(NOT WITH_DSO) + include(cuda) endif(WITH_GPU) +if(WITH_MKLML) + list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB}) +endif() + if(WITH_MKLDNN) - list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB}) + list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB}) endif() if(USE_NNPACK) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 24ddb24399dabeec9b8e5faf36be3eb21f420111..e550ec285668ea25757eeee9e7c5dc48fc9d339d 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -76,27 +76,14 @@ else() include_directories(${CUDA_TOOLKIT_INCLUDE}) endif(NOT WITH_GPU) -if(WITH_MKLDNN) - add_definitions(-DPADDLE_USE_MKLDNN) - if (WITH_MKLML AND MKLDNN_IOMP_DIR) - message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}") - set(OPENMP_FLAGS "-fopenmp") - set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) - set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") - else() - find_package(OpenMP) - if(OPENMP_FOUND) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - else() - message(WARNING "Can not find OpenMP." - "Some performance features in MKLDNN may not be available") - endif() - endif() - -endif(WITH_MKLDNN) +if (WITH_MKLML AND MKLML_IOMP_LIB) + message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") + set(OPENMP_FLAGS "-fopenmp") + set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") +endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake index 310450f7d009dc0cdae9c0079a96445af8ec8f95..d3f5bf6852b3b295f3b5806b0577a880b0ce6ba6 100644 --- a/cmake/cross_compiling/ios.cmake +++ b/cmake/cross_compiling/ios.cmake @@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform") # Set the architecture for iOS if(NOT DEFINED IOS_ARCH) if(IOS_PLATFORM STREQUAL "OS") - # FIXME(liuyiqun): support "armv7;armv7s;arm64" future - set(IOS_ARCH "arm64") + set(IOS_ARCH "armv7;armv7s;arm64") elseif(IOS_PLATFORM STREQUAL "SIMULATOR") - # FIXME(liuyiqun): support "i386;x86_64" future - set(IOS_ARCH "x86_64") + set(IOS_ARCH "i386;x86_64") endif() endif() set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS") @@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_ # Hidden visibilty is required for cxx on iOS set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags") -set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") +set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake new file mode 100644 index 0000000000000000000000000000000000000000..6bea7cf3022242ce48cc882915f7e71810937283 --- /dev/null +++ b/cmake/cuda.cmake @@ -0,0 +1,188 @@ +if(NOT WITH_GPU) + return() +endif() + +set(paddle_known_gpu_archs "30 35 50 52 60 61 70") +set(paddle_known_gpu_archs7 "30 35 50 52") +set(paddle_known_gpu_archs8 "30 35 50 52 60 61") + +###################################################################################### +# A function for automatic detection of GPUs installed (if autodetection is enabled) +# Usage: +# detect_installed_gpus(out_variable) +function(detect_installed_gpus out_variable) + if(NOT CUDA_gpu_detect_output) + set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) + + file(WRITE ${cufile} "" + "#include \n" + "int main() {\n" + " int count = 0;\n" + " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" + " if (count == 0) return -1;\n" + " for (int device = 0; device < count; ++device) {\n" + " cudaDeviceProp prop;\n" + " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" + " std::printf(\"%d.%d \", prop.major, prop.minor);\n" + " }\n" + " return 0;\n" + "}\n") + + execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}" + "--run" "${cufile}" + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(nvcc_res EQUAL 0) + # only keep the last line of nvcc_out + STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}") + STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}") + list(GET nvcc_out -1 nvcc_out) + string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}") + set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE) + endif() + endif() + + if(NOT CUDA_gpu_detect_output) + message(STATUS "Automatic GPU detection failed. Building for all known architectures.") + set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE) + else() + set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) + endif() +endfunction() + + +######################################################################## +# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME +# Usage: +# select_nvcc_arch_flags(out_variable) +function(select_nvcc_arch_flags out_variable) + # List of arch names + set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual") + set(archs_name_default "All") + if(NOT CMAKE_CROSSCOMPILING) + list(APPEND archs_names "Auto") + endif() + + # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui) + set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.") + set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} ) + mark_as_advanced(CUDA_ARCH_NAME) + + # verify CUDA_ARCH_NAME value + if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};") + string(REPLACE ";" ", " archs_names "${archs_names}") + message(FATAL_ERROR "Only ${archs_names} architeture names are supported.") + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Manual") + set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") + set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) + else() + unset(CUDA_ARCH_BIN CACHE) + unset(CUDA_ARCH_PTX CACHE) + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Kepler") + set(cuda_arch_bin "30 35") + elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") + set(cuda_arch_bin "50") + elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") + set(cuda_arch_bin "60 61") + elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") + set(cuda_arch_bin "70") + elseif(${CUDA_ARCH_NAME} STREQUAL "All") + set(cuda_arch_bin ${paddle_known_gpu_archs}) + elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") + detect_installed_gpus(cuda_arch_bin) + else() # (${CUDA_ARCH_NAME} STREQUAL "Manual") + set(cuda_arch_bin ${CUDA_ARCH_BIN}) + endif() + + # remove dots and convert to lists + string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}") + string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}") + string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") + string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") + list(REMOVE_DUPLICATES cuda_arch_bin) + list(REMOVE_DUPLICATES cuda_arch_ptx) + + set(nvcc_flags "") + set(nvcc_archs_readable "") + + # Tell NVCC to add binaries for the specified GPUs + foreach(arch ${cuda_arch_bin}) + if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)") + # User explicitly specified PTX for the concrete BIN + list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) + list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1}) + else() + # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch}) + list(APPEND nvcc_archs_readable sm_${arch}) + endif() + endforeach() + + # Tell NVCC to add PTX intermediate code for the specified architectures + foreach(arch ${cuda_arch_ptx}) + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch}) + list(APPEND nvcc_archs_readable compute_${arch}) + endforeach() + + string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}") + set(${out_variable} ${nvcc_flags} PARENT_SCOPE) + set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE) +endfunction() + +message(STATUS "CUDA detected: " ${CUDA_VERSION}) +if (${CUDA_VERSION} LESS 7.0) + set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) +elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") +elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + # CUDA 8 may complain that sm_20 is no longer supported. Suppress the + # warning for now. + list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") +endif() + +include_directories(${CUDA_INCLUDE_DIRS}) +list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) +if(NOT WITH_DSO) + list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) +endif(NOT WITH_DSO) + +# setting nvcc arch flags +select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) +list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) +message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") + +# Set C++11 support +set(CUDA_PROPAGATE_HOST_FLAGS OFF) + +# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. +# So, don't set these flags here. +list(APPEND CUDA_NVCC_FLAGS "-std=c++11") +list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") +list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") +# Set :expt-relaxed-constexpr to suppress Eigen warnings +list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") + +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) +elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) +elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) +elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) +endif() + +mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) +mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 5a06825beb73e85d8a55b7b578b187bee2c4340c..fc52d339d7a336b44c97f2e0a9fc8d6604854365 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -40,10 +40,9 @@ INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) IF(${CBLAS_PROVIDER} STREQUAL "MKLML") SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) - SET(MKLDNN_MKLROOT ${MKLML_ROOT}) - SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB}) - SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR}) - MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}") + MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}") +ELSE() + MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN") ENDIF() SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow") @@ -57,15 +56,16 @@ ExternalProject_Add( PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} - CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT} + CMAKE_ARGS -DMKLROOT=${MKLML_ROOT} CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} - -DMKLROOT:PATH=${MKLDNN_MKLROOT} + -DMKLROOT:PATH=${MKLML_ROOT} ) ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) -MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}") +MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") +add_definitions(-DPADDLE_USE_MKLDNN) LIST(APPEND external_project_dependencies mkldnn) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 324e29f931ecbb6beab2d363daa01a19b1a56b3e..4c4f59656dae68739f2f07f3febd510e727fe2dd 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -29,7 +29,7 @@ IF(NOT ${CBLAS_FOUND}) "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "openblas library." FORCE) - SET(OPENBLAS_CC "${CMAKE_C_COMPILER}") + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") IF(CMAKE_CROSSCOMPILING) SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) @@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND}) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) ENDIF() ELSEIF(IOS) - # FIXME(liuyiqun): support multiple architectures - SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") - SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") - IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7") - SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7") - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) - ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") + IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") + SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") + SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) + ELSE() + MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. " + "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.") ENDIF() ELSEIF(RPI) # use hardfp diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 8bd058222880b4df3b08da09c02f9fe7f1d0ee66..a8e1aca49c97df256b1269c286b0bce7732fa932 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +IF(MOBILE_INFERENCE) + return() +ENDIF() + INCLUDE(ExternalProject) SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 4593ae6180b6d7deb61d897eb634b17ac0bb1683..2b125cef6aa8d1021afe8a7a0d232d84d36be4bc 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -149,58 +149,3 @@ endforeach() foreach(flag ${GPU_COMMON_FLAGS}) safe_set_nvflag(${flag}) endforeach() - - -set(CUDA_PROPAGATE_HOST_FLAGS OFF) - -# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. -# So, don't set these flags here. -LIST(APPEND CUDA_NVCC_FLAGS -std=c++11) -LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math) - -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) -elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) -elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) -elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) -endif() - -function(specify_cuda_arch cuda_version cuda_arch) - if(${cuda_version} VERSION_GREATER "8.0") - foreach(capability 61 62) - if(${cuda_arch} STREQUAL ${capability}) - list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}") - endif() - endforeach() - elseif(${cuda_version} VERSION_GREATER "7.0" and ${cuda_arch} STREQUAL "53") - list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}") - endif() -endfunction() - -# Common gpu architectures: Kepler, Maxwell -foreach(capability 30 35 50) - list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}") -endforeach() - -if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0") - list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52") -endif() - -# Modern gpu architectures: Pascal -if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0") - list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60") - list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr) -endif() - -# Custom gpu architecture -set(CUDA_ARCH) - -if(CUDA_ARCH) - specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH}) -endif() - -set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS}) - diff --git a/cmake/util.cmake b/cmake/util.cmake index 117ab7f49cdf4a568cd203b2b17767643d0b2d50..ad905ab55ba3537054fa5b30b5fca4d83c406702 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -115,8 +115,8 @@ function(link_paddle_exe TARGET_NAME) target_link_libraries(${TARGET_NAME} log) endif(ANDROID) - if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR) - target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed") + if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB) + target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") endif() add_dependencies(${TARGET_NAME} ${external_project_dependencies}) diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index 203506d7ab84e5a5be2232b077eac2d433a99766..b2b55ec419d2f8453e067f202f6c1b7da6c201de 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -335,6 +335,16 @@ bilinear_interp .. autoclass:: paddle.v2.layer.bilinear_interp :noindex: +dot_prod +--------- +.. autoclass:: paddle.v2.layer.dot_prod + :noindex: + +out_prod +-------- +.. autoclass:: paddle.v2.layer.out_prod + :noindex: + power ----- .. autoclass:: paddle.v2.layer.power diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD index 16236763a73770f3fe5eadf67645765d0456f875..ec6d4681836e189f46dbb9b915a237dc15cda7cf 100644 --- a/doc/design/mkldnn/README.MD +++ b/doc/design/mkldnn/README.MD @@ -36,13 +36,13 @@ Figure 1. PaddlePaddle on IA. 我们把集成方案大致分为了如下几个方面。 ### CMake -我们会在`CMakeLists.txt`中会添加`WITH_MKLDNN`的选项,当设置这个值为`ON`的时候会启用编译MKL-DNN功能。同时会自动开启OpenMP用于提高MKL-DNN的性能。 +我们会在`CMakeLists.txt`中会给用户添加一个`WITH_MKL`的开关,他是负责`WITH_MKLML`和`WITH_MKLDNN`的总开关。 -同时,我们会引入`WITH_MKLML`选项,用于选择是否使用MKL-DNN自带的MKLML安装包。这个安装包可以独立于MKL-DNN使用,但是建议在开启MKL-DNN的同时也打开MKLML的开关,这样才能发挥最好的性能。 +当打开`WITH_MKL`时,会开启MKLML的功能,作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 如果系统支持AVX2指令集及以上,同时会开启MKL-DNN功能。 -所以,我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中。 +当关闭`WITH_MKL`时,MKLML和MKL-DNN功能会同时关闭。 -**备注**:当`WITH_MKLML=ON`的时候,会优先使用这个包作为PaddlePaddle的CBLAS和LAPACK库,所以会稍微改动`cmake/cblas.cmake`中的逻辑。 +所以,我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中。 ### Layers 所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在 diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst index 731a63f945c29ba78538b3d71289b234e569354d..61f3a223547b352cf7929615cf3682b29b9a738f 100644 --- a/doc/howto/dev/write_docs_cn.rst +++ b/doc/howto/dev/write_docs_cn.rst @@ -34,7 +34,7 @@ PaddlePaddle的文档构建有两种方式。 cd TO_YOUR_PADDLE_CLONE_PATH mkdir -p build cd build - cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON + cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON make gen_proto_py make paddle_docs paddle_docs_cn diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md index 882066f23714f7ab3bba9199b5fa5ff2325ce849..424d7718c64438496cf0895397babd5408e1ca02 100644 --- a/doc/mobile/cross_compiling_for_android_cn.md +++ b/doc/mobile/cross_compiling_for_android_cn.md @@ -1,4 +1,4 @@ -# 构建Android平台上的PaddlePaddle库 +# Android平台编译指南 用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库: - 基于Docker容器的编译方式 diff --git a/doc/mobile/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md index cda636a67de712e072f4cc7ad859dda75211eaa8..9da48e7f2119ce901fbb3abab73400df27be16d2 100644 --- a/doc/mobile/cross_compiling_for_ios_cn.md +++ b/doc/mobile/cross_compiling_for_ios_cn.md @@ -1,4 +1,4 @@ -# 构建iOS平台上的PaddlePaddle库 +# iOS平台编译指南 交叉编译iOS平台上适用的PaddlePaddle库,需要在MacOS系统上进行。本文的将介绍在MacOS上,从源码交叉编译iOS平台上适用的PaddlePaddle库。 ## 准备交叉编译环境 @@ -25,7 +25,7 @@ iOS平台可选配置参数: - `IOS_PLATFORM`,可设置为`OS/SIMULATOR`,默认值为`OS`。 - `OS`,构建目标为`arm`架构的iPhone或者iPad等物理设备。 - `SIMULATOR`,构建目标为`x86`架构的模拟器平台。 -- `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示: +- `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示,默认编译所有架构: @@ -41,11 +41,11 @@ iOS平台可选配置参数: - + - +
OSarmv7, armv7s, arm64 (默认)armv7, armv7s, arm64
SIMULATORi386, x86_64 (默认)i386, x86_64
@@ -66,7 +66,7 @@ iOS平台可选配置参数: ```bash cmake -DCMAKE_SYSTEM_NAME=iOS \ -DIOS_PLATFORM=OS \ - -DIOS_ARCH="arm64" \ + -DIOS_ARCH="armv7;arm64" \ -DIOS_ENABLE_BITCODE=ON \ -DIOS_USE_VECLIB_FOR_BLAS=ON \ -DCMAKE_INSTALL_PREFIX=your/path/to/install \ @@ -112,6 +112,6 @@ $ make install - `lib`目录,其中包含PaddlePaddle的C-API静态库 - `third_party`目录,其中包含所依赖的所有第三方库 -注意,不同架构的PaddlePaddle库建议安装到不同的目录下,然后使用`lipo`工具将多个静态库合并成一个支持多个架构的fat库。 +注意,如果PaddlePaddle库需要同时支持真机和模拟器,则需要分别编译真机和模拟器版本,然后使用`lipo`工具合并fat库。 自此,PaddlePaddle库已经安装完成,用户可将合成的fat库用于深度学习相关的iOS App中,调用方法见C-API文档。 diff --git a/doc/mobile/cross_compiling_for_raspberry_cn.md b/doc/mobile/cross_compiling_for_raspberry_cn.md index 6e983645faaed1f67edaeeb82ddbef9cef6bb85f..f8ef9dc8031613831437745995268f3abc392f5b 100644 --- a/doc/mobile/cross_compiling_for_raspberry_cn.md +++ b/doc/mobile/cross_compiling_for_raspberry_cn.md @@ -1,4 +1,4 @@ -# 构建Raspberry Pi平台上的PaddlePaddle库 +# Raspberry Pi平台编译指南 通常有两个方法来构建基于 Rasspberry Pi 的版本: diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h index ede2670882ee2b93f610a2261a4ecc1784bc2d0c..4ab8de80d1c7be0f8e3eb848955373dd5e21bc18 100644 --- a/paddle/cuda/include/hl_gpu.h +++ b/paddle/cuda/include/hl_gpu.h @@ -25,7 +25,9 @@ limitations under the License. */ #include "hl_matrix.h" #include "hl_sequence.h" #include "hl_sparse.h" +#ifndef PADDLE_MOBILE_INFERENCE #include "hl_warpctc_wrap.h" +#endif #ifdef HPPL_STUB_FUNC #include "stub/hl_aggregate_stub.h" diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index b3b9c45ded95ce2e735b8898d47760956dcacdce..00d9dd238ec5328be28f58f8118daad3a039e08c 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -270,6 +270,19 @@ static bool AllGradInSet(const std::vector& names, return false; } } + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + sout << "All input {"; + for (auto& name : names) { + sout << name << ","; + } + sout << "} is in {"; + for (auto& name : set) { + sout << name << ","; + } + sout << "}"; + VLOG(10) << sout.str(); + } return true; } @@ -290,14 +303,12 @@ static void CreateGradVarInBlock( auto ops = block_desc->AllOps(); for (size_t op_index = grad_op_start_index; op_index < ops.size(); ++op_index) { - bool need_infer_shape = false; std::unordered_set new_vars; ForEachVarName(ops[op_index]->Outputs(), [&](const std::string& grad_var_name) { if (block_desc->HasVar(grad_var_name)) { return false; } - need_infer_shape = true; auto var = block_desc->Var(grad_var_name); new_vars.insert(var->Name()); auto it = param_name_map.find(grad_var_name); @@ -311,23 +322,21 @@ static void CreateGradVarInBlock( grad_record.op_idx_ = static_cast(op_index); return false; /* not break */ }); - if (need_infer_shape) { - ops[op_index]->InferVarType(block_desc); - for (auto& arg : ops[op_index]->OutputArgumentNames()) { - if (new_vars.find(arg) == new_vars.end()) { - continue; - } - auto pname = FwdName(arg); - auto* param = block_desc->FindVarRecursive(pname); - auto* grad = block_desc->FindVar(arg); - if (param == nullptr) { - grad->SetDataType(DataType::FP32); - } else { - grad->SetDataType(param->GetDataType()); - } + ops[op_index]->InferVarType(block_desc); + for (auto& arg : ops[op_index]->OutputArgumentNames()) { + if (new_vars.find(arg) == new_vars.end()) { + continue; + } + auto pname = FwdName(arg); + auto* param = block_desc->FindVarRecursive(pname); + auto* grad = block_desc->FindVar(arg); + if (param == nullptr) { + grad->SetDataType(DataType::FP32); + } else { + grad->SetDataType(param->GetDataType()); } - ops[op_index]->InferShape(*block_desc); } + ops[op_index]->InferShape(*block_desc); } } @@ -387,6 +396,7 @@ std::vector> MakeBlockBackward( ProgramDescBind& program_desc, int block_idx, std::unordered_set* no_grad_vars, std::unordered_map* grad_to_var) { + VLOG(5) << "MakeBlockBackward"; BlockDescBind* cur_block = program_desc.MutableBlock(block_idx); std::vector op_descs = cur_block->AllOps(); std::unordered_map> dup_out_ops; @@ -394,9 +404,10 @@ std::vector> MakeBlockBackward( std::vector> backward_descs; for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) { + VLOG(5) << "Making backward " << (*it)->Type() << " op"; std::vector> op_grads; - if ((*it)->Type() == "recurrent") { + if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") { int step_block_idx = (*it)->GetBlockAttr("step_block"); BlockDescBind* backward_block = CreateStepBlock( program_desc, no_grad_vars, grad_to_var, step_block_idx); @@ -410,6 +421,15 @@ std::vector> MakeBlockBackward( op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var); } + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + sout << "Made "; + for (auto& op_grad : op_grads) { + sout << op_grad->Type() << " "; + } + VLOG(10) << sout.str(); + } + for (const auto& desc : op_grads) { for (const std::string& out_name : desc->OutputArgumentNames()) { if (out_name.find("@GRAD") == std::string::npos) { @@ -425,6 +445,8 @@ std::vector> MakeBlockBackward( op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs), [](std::unique_ptr& ptr) { return std::move(ptr); }); } + + VLOG(5) << "Appending Sums"; // Check whether some variables are written more than once std::list>> pending_sum_ops; for (const auto& dup : dup_out_ops) { @@ -432,16 +454,22 @@ std::vector> MakeBlockBackward( const std::vector dup_op = dup.second; if (out_name != kEmptyVarName && dup_op.size() > 1) { std::vector sum_op_inputs; + std::string next_g_name = out_name; for (size_t i = 0; i < dup_op.size(); ++i) { + VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name + << " duplicated"; std::string new_name = out_name + "@RENAME@" + std::to_string(i); - backward_descs[dup_op[i]]->Rename(out_name, new_name); + backward_descs[dup_op[i]]->RenameOutput(out_name, new_name); + backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name); sum_op_inputs.emplace_back(new_name); + next_g_name = sum_op_inputs.back(); } std::unique_ptr sum_op(new OpDescBind( "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {})); pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); } } + pending_sum_ops.sort( [](const std::pair>& a, const std::pair>& b) { @@ -452,6 +480,8 @@ std::vector> MakeBlockBackward( std::move(p.second)); } + VLOG(5) << "MakeBlockBackward Finished"; + return backward_descs; } diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h index 3ec88d7a72c3339bf5e7d0ca3957a3f608f039b7..be144d8fc0104fccc08006532a85906ade25c2a1 100644 --- a/paddle/framework/data_type.h +++ b/paddle/framework/data_type.h @@ -29,6 +29,8 @@ inline DataType ToDataType(std::type_index type) { return DataType::INT32; } else if (typeid(int64_t).hash_code() == type.hash_code()) { return DataType::INT64; + } else if (typeid(bool).hash_code() == type.hash_code()) { + return DataType::BOOL; } else { PADDLE_THROW("Not supported"); } diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index 53b899a23997b71e723a298ec360a4e018d89878..8b6f42b82df14bfcd25f33ef16b5903fb965a8ba 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -60,8 +60,7 @@ void make_ddim(DDim& ddim, const int64_t* dims, int n) { ddim = make_dim<9>(dims); break; default: - throw std::invalid_argument( - "Dynamic dimensions must have between [1, 9] dimensions."); + PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions."); } } diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 2fcf41d69f0011b0d9a3d89c97fcebacb0703e97..adedd8cb0e8504fd6fc924e62a2ede3c1c7ce698 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -120,6 +120,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id, for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); + VLOG(10) << op->DebugString(); op->Run(*local_scope, *device); } if (create_local_scope) { diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 39c8def82e1ebb10a0e357a648af760099020c32..48cd131550dea5ad3f368b25c31d753efbe0dff9 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -235,6 +235,23 @@ void OpDescBind::Rename(const std::string &old_name, need_update_ = true; } +void OpDescBind::RenameOutput(const std::string &old_name, + const std::string &new_name) { + for (auto &output : outputs_) { + std::replace(output.second.begin(), output.second.end(), old_name, + new_name); + } + need_update_ = true; +} + +void OpDescBind::RenameInput(const std::string &old_name, + const std::string &new_name) { + for (auto &input : inputs_) { + std::replace(input.second.begin(), input.second.end(), old_name, new_name); + } + need_update_ = true; +} + struct SetAttrDescVisitor : public boost::static_visitor { explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {} mutable OpDesc::Attr *attr_; @@ -448,7 +465,12 @@ const std::vector &CompileTimeInferShapeContext::Outputs( DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const { auto var = block_.FindVarRecursive(name); PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); - return framework::make_ddim(var->Shape()); + try { + return framework::make_ddim(var->Shape()); + } catch (...) { + VLOG(5) << "GetDim of variable " << name << " error"; + std::rethrow_exception(std::current_exception()); + } } void CompileTimeInferShapeContext::SetDim(const std::string &name, diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h index e3e96441bbf51729f2ba69c9257e6961b1de0d5c..da032319afa775571d3942bf6ae415db7d233735 100644 --- a/paddle/framework/op_desc.h +++ b/paddle/framework/op_desc.h @@ -73,6 +73,10 @@ class OpDescBind { void Rename(const std::string &old_name, const std::string &new_name); + void RenameOutput(const std::string &old_name, const std::string &new_name); + + void RenameInput(const std::string &old_name, const std::string &new_name); + // Only be used in C++ const AttributeMap &GetAttrMap() const; diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 3276f8af396fe58450a8dc6713fe61e49d5ca708..93467ab8ac796277b47a861a427de2837fb2d3d4 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -403,19 +403,6 @@ class RuntimeInferShapeContext : public InferShapeContext { void OperatorWithKernel::Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const { - if (VLOG_IS_ON(1)) { - auto inputs = this->InputVars(); - auto outputs = this->OutputVars(true); - std::ostringstream sout; - sout << "Run operator " << this->Type() << " From ["; - std::ostream_iterator out_it(sout, ","); - std::copy(inputs.begin(), inputs.end(), out_it); - sout << "] to ["; - std::copy(outputs.begin(), outputs.end(), out_it); - sout << "]"; - VLOG(1) << sout.str(); - } - RuntimeInferShapeContext infer_shape_ctx(*this, scope); this->InferShape(&infer_shape_ctx); diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc index 9428b8a07ea0af005f6e960ddaa02da624ad9d97..9ad6272c99dd6a85520ae44c1331ac232bc6a9a2 100644 --- a/paddle/framework/scope.cc +++ b/paddle/framework/scope.cc @@ -38,11 +38,12 @@ Scope& Scope::NewScope() const { Variable* Scope::Var(const std::string& name) { auto iter = vars_.find(name); if (iter != vars_.end()) { + VLOG(3) << "Get existing variable " << name; return iter->second; } Variable* v = new Variable(); vars_[name] = v; - VLOG(3) << "Create variable " << name << " on scope"; + VLOG(3) << "Create variable " << name; v->name_ = &(vars_.find(name)->first); return v; } diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h index 7d36ead2ca85328c7843b3b5d423cf8e921d1c93..05dc47f06ac81f0acb6d0317cbecb3009c7dd7f0 100644 --- a/paddle/framework/shape_inference.h +++ b/paddle/framework/shape_inference.h @@ -53,6 +53,10 @@ class InferShapeContext { virtual bool IsRuntime() const = 0; + // Note: In while op, we need this to be public + void SetDims(const std::vector &names, + const std::vector &dims); + protected: virtual framework::DDim GetDim(const std::string &name) const = 0; virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0; @@ -60,9 +64,6 @@ class InferShapeContext { std::vector GetDims( const std::vector &names) const; - void SetDims(const std::vector &names, - const std::vector &dims); - std::vector GetVarTypes( const std::vector &names) const; diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt index 91d732641a4a5eed050841b59fd10da397eb732f..41ead3c5ecef248830cfb0f8be360f21dcd58e7b 100644 --- a/paddle/gserver/CMakeLists.txt +++ b/paddle/gserver/CMakeLists.txt @@ -73,7 +73,6 @@ if(MOBILE_INFERENCE) list(REMOVE_ITEM GSERVER_SOURCES dataproviders/DataProvider.cpp dataproviders/MultiDataProvider.cpp - dataproviders/ProtoDataProvider.cpp dataproviders/PyDataProvider2.cpp dataproviders/PyDataProvider.cpp) diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp index 0478256f9cd81f4a99eb0cbcbd1a5a21de5cf14b..106cf5b6228e636026ded558d0f591022f1ae586 100644 --- a/paddle/gserver/dataproviders/DataProvider.cpp +++ b/paddle/gserver/dataproviders/DataProvider.cpp @@ -16,8 +16,8 @@ limitations under the License. */ #include #include -#include "ProtoDataProvider.h" #include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" #include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" @@ -164,8 +164,6 @@ DataProvider* DataProvider::create(const DataConfig& config, REGISTER_DATA_PROVIDER(simple, SimpleDataProvider); REGISTER_DATA_PROVIDER(dummy, DummyDataProvider); -REGISTER_DATA_PROVIDER(proto, ProtoDataProvider); -REGISTER_DATA_PROVIDER(proto_sequence, ProtoSequenceDataProvider); int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) { int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch) diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp deleted file mode 100644 index c6f5cab1915b7f41d505c37a7fef762a392bad7f..0000000000000000000000000000000000000000 --- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp +++ /dev/null @@ -1,932 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ProtoDataProvider.h" -#include -#include -#include -#include "paddle/utils/StringUtil.h" -#include "paddle/utils/Util.h" - -#include "DataProviderGroup.h" -#include "paddle/utils/Logging.h" - -DEFINE_double(memory_threshold_on_load_data, - 1.0, - "stop loading data when memory is not sufficient"); - -namespace paddle { - -REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup); -REGISTER_DATA_PROVIDER(proto_sequence_group, - DataProviderGroup); - -ProtoDataProvider::ProtoDataProvider(const DataConfig& config, - bool useGpu, - bool loadDataAll) - : DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) { - if (loadDataAll) { - loadData(config_.files()); - } -} - -void ProtoDataProvider::loadData(const std::vector& fileList) { - for (auto& file : fileList) { - if (FLAGS_memory_threshold_on_load_data < 1.0) { - double memUsage = getMemoryUsage(); - if (memUsage > FLAGS_memory_threshold_on_load_data) { - LOG(INFO) << "memUsage is " << memUsage << ", > " - << FLAGS_memory_threshold_on_load_data - << " therefore SKIP ALL REMAINING file."; - break; - } - } - LOG(INFO) << "load data file " << file; - loadDataFile(file); - } - - if (sequenceStartPositions_.size() == sampleNums_) { - // This means that each sample is one sequence - shuffledSequenceIds_.swap(sequenceStartPositions_); - } else { - sequenceStartPositions_.push_back(sampleNums_); - shuffledSequenceIds_.reserve(sequenceStartPositions_.size() - 1); - for (size_t i = 0; i < sequenceStartPositions_.size() - 1; ++i) { - shuffledSequenceIds_.push_back(i); - } - } - - LOG(INFO) << "read done, num of instance=" << sampleNums_; - showDataStats(); -} - -void ProtoDataProvider::loadData(const std::string& fileName) { - std::vector fileList; - loadFileList(fileName, fileList); - loadData(fileList); -} - -void ProtoDataProvider::checkDataHeader(const DataHeader& header) { - if (header_.slot_defs_size()) { - // header_ is already set. Need to check consistency. - CHECK_EQ(header_.slot_defs_size(), header.slot_defs_size()) - << "Different header"; - for (int i = 0; i < header.slot_defs_size(); ++i) { - CHECK_EQ(header_.slot_defs(i).type(), header.slot_defs(i).type()); - CHECK_EQ(header_.slot_defs(i).dim(), header.slot_defs(i).dim()); - } - return; - } - - // header_ is not set before - CHECK(header.slot_defs_size()) << "Invalid header: no slot is defined"; - int i; - for (i = 0; i < header.slot_defs_size(); ++i) { - if (header.slot_defs(i).type() == SlotDef::INDEX || - header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX) { - break; - } - constexpr int kBufLen = 100; - char buf[kBufLen]; - snprintf(buf, kBufLen, "slot%d_nnz", i); - nnzStats_.push_back(getStat(buf)); - } - numVecSlots_ = i; - - // Check that INDEX slots are after VECTOR slots - for (int i = numVecSlots_; i < header.slot_defs_size(); ++i) { - CHECK(header.slot_defs(i).type() == SlotDef::INDEX || - header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX); - } - - slots_.clear(); - slots_.reserve(header.slot_defs_size()); - for (int i = 0; i < header.slot_defs_size(); ++i) { - slots_.emplace_back(); - slots_.back().type = header.slot_defs(i).type(); - slots_.back().dim = header.slot_defs(i).dim(); - if (SlotDef::VECTOR_SPARSE_NON_VALUE == header.slot_defs(i).type() || - SlotDef::VECTOR_SPARSE_VALUE == header.slot_defs(i).type()) { - slots_.back().indices.push_back(0); - } - } - - header_ = header; -} - -void ProtoDataProvider::checkSample(const DataSample& sample) { - CHECK_EQ(numVecSlots_, sample.vector_slots_size()); - CHECK(header_.slot_defs_size() == numVecSlots_ + sample.id_slots_size() || - header_.slot_defs_size() == numVecSlots_ + sample.var_id_slots_size()); - for (int i = 0; i < numVecSlots_; ++i) { - uint32_t dim = header_.slot_defs(i).dim(); - switch (header_.slot_defs(i).type()) { - case SlotDef::VECTOR_DENSE: { - CHECK_EQ(static_cast(dim), sample.vector_slots(i).values_size()); - CHECK_EQ(0, sample.vector_slots(i).ids_size()); - break; - } - case SlotDef::VECTOR_SPARSE_NON_VALUE: { - if (0 == sample.vector_slots(i).ids_size()) { - break; - } - CHECK_LT(0, sample.vector_slots(i).ids_size()); - CHECK_EQ(0, sample.vector_slots(i).values_size()); - auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(), - sample.vector_slots(i).ids().end()); - CHECK_GT(dim, maxId); - break; - } - case SlotDef::VECTOR_SPARSE_VALUE: { - if (0 == sample.vector_slots(i).ids_size()) { - CHECK_EQ(0, sample.vector_slots(i).values_size()); - break; - } - CHECK_LT(0, sample.vector_slots(i).values_size()); - CHECK_GE(static_cast(dim), sample.vector_slots(i).values_size()); - CHECK_EQ(sample.vector_slots(i).values_size(), - sample.vector_slots(i).ids_size()); - auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(), - sample.vector_slots(i).ids().end()); - CHECK_GT(dim, maxId); - break; - } - case SlotDef::VAR_MDIM_DENSE: { - if (static_cast(dim) != 0) { - CHECK_EQ(static_cast(dim), sample.vector_slots(i).values_size()); - if (sample.vector_slots(i).dims_size() != 0) { - int totalDim = sample.vector_slots(i).dims(0); - for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) { - totalDim *= sample.vector_slots(i).dims(j); - } - CHECK_EQ(static_cast(dim), totalDim); - } - } else { - CHECK_NE(sample.vector_slots(i).dims_size(), 0); - int totalDim = sample.vector_slots(i).dims(0); - for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) { - totalDim *= sample.vector_slots(i).dims(j); - } - CHECK_EQ(totalDim, sample.vector_slots(i).values_size()); - } - break; - } - case SlotDef::STRING: { - CHECK_EQ(static_cast(1), sample.vector_slots(i).strs_size()); - CHECK_EQ(0, sample.vector_slots(i).ids_size()); - CHECK_EQ(0, sample.vector_slots(i).values_size()); - break; - } - default: - LOG(FATAL) << "BUG: Should not reach here"; - } - } - for (int i = numVecSlots_; i < header_.slot_defs_size(); ++i) { - if (header_.slot_defs(i).type() != SlotDef::VAR_MDIM_INDEX) { - uint32_t id = sample.id_slots(i - numVecSlots_); - if (id == -1U) continue; - CHECK_LT(id, header_.slot_defs(i).dim()); - } else { - for (int j = 0; j < sample.var_id_slots(i - numVecSlots_).ids_size(); - ++j) { - uint32_t id = sample.var_id_slots(i - numVecSlots_).ids(j); - CHECK_LT(id, header_.slot_defs(i).dim()); - } - } - } -} - -void ProtoDataProvider::loadDataFile(const std::string& fileName) { - std::ifstream is(fileName); - CHECK(is) << "Fail to open " << fileName; - bool dataCompression = str::endsWith(fileName, ".gz"); - std::unique_ptr reader(new ProtoReader(&is, dataCompression)); - CHECK(reader) << "Fail to create proto data input stream"; - - DataHeader header; - CHECK(reader->read(&header)); - checkDataHeader(header); - - DataSample sample; - do { - if (!reader->read(&sample)) { - break; - } - checkSample(sample); - if (sample.is_beginning()) { - sequenceStartPositions_.push_back(sampleNums_); - } - fillSlots(sample); - ++sampleNums_; - } while (true); - - CHECK(is.eof()) << "Fail to read file"; - reader.reset(nullptr); - is.close(); -} - -// checkSample has done before, no check here -void ProtoDataProvider::fillSlots(const DataSample& sample) { - for (size_t i = 0; i < slots_.size(); ++i) { - auto& slot = slots_[i]; - int dim = slot.dim; - switch (slot.type) { - case SlotDef::VECTOR_DENSE: { - size_t oldSize = slot.denseData.size(); - slot.denseData.resize(oldSize + dim); - const float* values = sample.vector_slots(i).values().data(); -#ifdef PADDLE_TYPE_DOUBLE - std::copy(values, values + dim, slot.denseData.begin() + oldSize); -#else - memcpy(slot.denseData.data() + oldSize, values, sizeof(real) * dim); -#endif - break; - } - case SlotDef::VECTOR_SPARSE_NON_VALUE: { - int slotSize = sample.vector_slots(i).ids_size(); - int subSlotSize = 0; - int id = 0; // the slot id - // find whether this vector_slots has subseq. If not has subseq, - // subSlotSize = 0. - for (id = 0; id < sample.subseq_slots_size(); id++) { - if (sample.subseq_slots(id).slot_id() == i) { - subSlotSize = sample.subseq_slots(id).lens_size(); - break; - } - } - if (subSlotSize && slot.subIndices.size() == 0UL) { - // If has subSeq, the first element of subIndices = 0. - slot.subIndices.push_back(0); - } - if (slotSize == 0UL) { - // if has no id, new indices = old indices. - slot.indices.push_back(slot.indices.back()); - // if has subSeq, new subIndices = old subIndices. - if (slot.subIndices.size()) { - slot.subIndices.push_back(slot.subIndices.back()); - } - break; - } - slot.sparseNonValueData.resize(slot.indices.back() + slotSize); - const unsigned int* ids = sample.vector_slots(i).ids().data(); - memcpy(slot.sparseNonValueData.data() + slot.indices.back(), - ids, - sizeof(*ids) * slotSize); - slot.indices.push_back(slot.indices.back() + slotSize); - if (subSlotSize) { - for (int ii = 0; ii < subSlotSize; ++ii) { - slot.subIndices.push_back(slot.subIndices.back() + - sample.subseq_slots(id).lens(ii)); - } - } - break; - } - case SlotDef::VECTOR_SPARSE_VALUE: { - if (0 == sample.vector_slots(i).ids_size()) { - slot.indices.push_back(slot.indices.back()); - break; - } - int slotSize = sample.vector_slots(i).ids_size(); - slot.sparseFloatValueData.resize(slot.indices.back() + slotSize); - const unsigned int* ids = sample.vector_slots(i).ids().data(); - const float* values = sample.vector_slots(i).values().data(); - for (int ii = 0; ii < slotSize; ++ii) { - slot.sparseFloatValueData[slot.indices.back() + ii].col = ids[ii]; - slot.sparseFloatValueData[slot.indices.back() + ii].value = - values[ii]; - } - slot.indices.push_back(slot.indices.back() + slotSize); - break; - } - case SlotDef::INDEX: { - slot.indexData.push_back(sample.id_slots(i - numVecSlots_)); - break; - } - case SlotDef::VAR_MDIM_DENSE: { - size_t oldSize = slot.varDenseData.size(); - slot.varDenseData.resize(oldSize + 1); - size_t varDim = sample.vector_slots(i).values_size(); - slot.varDenseData[oldSize].data.resize(varDim); - const float* values = sample.vector_slots(i).values().data(); -#ifdef PADDLE_TYPE_DOUBLE - std::copy( - values, values + varDim, slot.varDenseData[oldSize].data.data()); -#else - memcpy(slot.varDenseData[oldSize].data.data(), - values, - sizeof(real) * varDim); -#endif - slot.varDenseData[oldSize].dims.resize( - sample.vector_slots(i).dims_size()); - memcpy(slot.varDenseData[oldSize].dims.data(), - sample.vector_slots(i).dims().data(), - sizeof(uint32_t) * sample.vector_slots(i).dims_size()); - break; - } - case SlotDef::VAR_MDIM_INDEX: { - size_t oldSize = slot.varIndices.size(); - slot.varIndices.resize(oldSize + 1); - size_t varDim = sample.var_id_slots(i - numVecSlots_).ids_size(); - slot.varIndices[oldSize].resize(varDim); - memcpy(slot.varIndices[oldSize].data(), - sample.var_id_slots(i - numVecSlots_).ids().data(), - sizeof(uint32_t) * varDim); - break; - } - case SlotDef::STRING: { - slot.strData.push_back(sample.vector_slots(i).strs(0)); - break; - } - } - } -} - -void ProtoDataProvider::showDataStats() { - std::ostringstream oss; - for (size_t i = 0; i < slots_.size(); ++i) { - auto& slot = slots_[i]; - if (slot.type == SlotDef::VECTOR_SPARSE_NON_VALUE) { - size_t nnz = slot.sparseNonValueData.size(); - oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; "; - } else if (slot.type == SlotDef::VECTOR_SPARSE_VALUE) { - size_t nnz = slot.sparseFloatValueData.size(); - oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; "; - } - } - LOG(INFO) << oss.str(); -} - -void ProtoDataProvider::reset() { - currentSequenceIndex_ = 0; - if (!skipShuffle_) { - shuffle(); - } - - DataProvider::reset(); -} - -void ProtoDataProvider::shuffle() { - std::shuffle(shuffledSequenceIds_.begin(), - shuffledSequenceIds_.end(), - ThreadLocalRandomEngine::get()); -} - -/* - Loop through sequences starting from currentSequenceIndex_ - for at most size samples. For each sequence ranging from [begin, end), - op(begin, end) will be called. - - return the number of sequences scanned -*/ -template -int64_t ProtoDataProvider::sequenceLoop(Op op, int64_t size) { - int64_t sz = 0; - size_t i; - size_t sequenceCount = shuffledSequenceIds_.size(); - if (usageRatio_ < 1.0f) { - sequenceCount = static_cast(sequenceCount * usageRatio_); - } - for (i = currentSequenceIndex_; i < sequenceCount; ++i) { - size_t id = shuffledSequenceIds_[i]; - int64_t begin = sequenceStartPositions_[id]; - int64_t end = sequenceStartPositions_[id + 1]; - int64_t len = end - begin; - if (sz + len > size && sz > 0) break; - sz += len; - op(begin, end); - } - return i - currentSequenceIndex_; -} - -/* - Loop through sequences starting from currentSequenceIndex_ - for at most size samples. For each sample of each sequence at position - pos, op(pos) will be called. - - return the number of sequences scanned -*/ -template -int64_t ProtoDataProvider::sampleLoop(Op op, int64_t size) { - if (iidData()) { - size = std::min(sampleNums_ - currentSequenceIndex_, size); - for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size; - ++i) { - size_t pos = shuffledSequenceIds_[i]; - op(pos); - } - return size; - } else { - auto f = [op](int64_t begin, int64_t end) { - for (int64_t pos = begin; pos < end; ++pos) { - op(pos); - } - }; - return sequenceLoop(f, size); - } -} - -/* - Loop through sub-sequences starting from currentSequenceIndex_ - for at most size samples. For each sample of each sub-sequence at position - pos, op(pos) will be called. - - return the number of sub-sequences scanned -*/ -template -int64_t ProtoDataProvider::subSampleLoop(Op op, int64_t size, int slot) { - CHECK(iidData()) << "subSampleLoop only accepts iid data"; - size = std::min(sampleNums_ - currentSequenceIndex_, size); - int subSize = 0; - for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size; - ++i) { - size_t pos = shuffledSequenceIds_[i]; - int64_t* indexs = slots_[slot].indices.data(); - int64_t* subIndexs = slots_[slot].subIndices.data(); - int64_t subSeqStart = 0; - int64_t subSeqEnd = 0; - for (int j = 0; j < (int)slots_[slot].subIndices.size(); j++) { - if (subIndexs[j] == indexs[pos]) { - subSeqStart = j; - if (subIndexs[pos] == subIndexs[pos + 1]) { - subSeqEnd = j + 1; - break; - } - } else if (subIndexs[j] == indexs[pos + 1]) { - subSeqEnd = j; - break; - } - } - for (int j = subSeqStart; j < subSeqEnd; j++) { - op(j); - } - subSize += subSeqEnd - subSeqStart; - } - return subSize; -} - -int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, - DataBatch* batch) { - int64_t numSequences = 0; // actual number of sequences in the batch - - // the number of sequences scanned, including those skipped because too long - int64_t numScannedSeqs = 0; - std::lock_guard guard(lock_); - if (iidData()) { - size = std::min(getSize() - currentSequenceIndex_, size); - numScannedSeqs = numSequences = size; - } else { - int64_t sz = 0; - auto op = [&sz, &numSequences](int64_t begin, int64_t end) { - ++numSequences; - sz += end - begin; - }; - numScannedSeqs = sequenceLoop(op, size); - VLOG_IF(1, numScannedSeqs > numSequences) - << numScannedSeqs - numSequences - << " sequences are skipped because longer than " << size; - size = sz; - } - if (size <= 0) return 0; - - DataBatch& cpuBatch = *cpuBatch_; - std::vector& cpuArguments = cpuBatch.getStreams(); - cpuBatch.setSize(size); - cpuArguments.resize(header_.slot_defs_size()); - - if (!iidData()) { - ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions, - numSequences + 1, - /* useGpu= */ false); - int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false); - int pos = 0; - int i = 0; - auto op = [buf, &pos, &i](int64_t begin, int64_t end) { - buf[i] = pos; - pos += end - begin; - ++i; - }; - sequenceLoop(op, size); - buf[i] = size; - for (size_t slot = 1; slot < cpuArguments.size(); ++slot) { - cpuArguments[slot].sequenceStartPositions = - cpuArguments[0].sequenceStartPositions; - } - } - - for (int slot = 0; slot < header_.slot_defs_size(); ++slot) { - size_t dim = header_.slot_defs(slot).dim(); - SlotDef::SlotType slotType = header_.slot_defs(slot).type(); - - std::vector dataPos; - dataPos.reserve(size); - auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); }; - sampleLoop(op, size); - - switch (slotType) { - case SlotDef::VECTOR_DENSE: { - Matrix::resizeOrCreate(cpuArguments[slot].value, - size, - dim, - false, // trans = false - false); // useGpu = false - real* buf = cpuArguments[slot].value->getData(); - for (int i = 0; i < size; ++i) { - memcpy(buf + i * dim, - slots_[slot].denseData.data() + dataPos[i] * dim, - sizeof(real) * dim); - } - break; - } - case SlotDef::VECTOR_SPARSE_NON_VALUE: { - if (!(cpuArguments[slot].value)) { - cpuArguments[slot].value = - Matrix::createSparseMatrix(size, - dim, - size /*DEFAULT_AVG_WIDTH = 1*/, - NO_VALUE, - SPARSE_CSR, - false, - useGpu_); - } - auto mat = cpuArguments[slot].value; - mat->resize(size, dim); - if (std::dynamic_pointer_cast(mat)) { - std::dynamic_pointer_cast(mat)->copyFrom( - dataPos.data(), - slots_[slot].indices.data(), - slots_[slot].sparseNonValueData.data(), - HPPL_STREAM_1); - } else if (std::dynamic_pointer_cast(mat)) { - std::dynamic_pointer_cast(mat)->copyFrom( - dataPos.data(), - slots_[slot].indices.data(), - slots_[slot].sparseNonValueData.data()); - } else { - LOG(FATAL) << "Not Supported"; - } - size_t numElements = 0; - for (auto pos : dataPos) { - numElements += - slots_[slot].indices[pos + 1] - slots_[slot].indices[pos]; - } - nnzStats_[slot]->addSample(numElements); - - break; - } - case SlotDef::VECTOR_SPARSE_VALUE: { - if (!(cpuArguments[slot].value)) { - cpuArguments[slot].value = - Matrix::createSparseMatrix(size, - dim, - size /*DEFAULT_AVG_WIDTH = 1*/, - FLOAT_VALUE, - SPARSE_CSR, - false, - useGpu_); - } - auto mat = cpuArguments[slot].value; - mat->resize(size, dim); - if (std::dynamic_pointer_cast(mat)) { - std::dynamic_pointer_cast(mat)->copyFrom( - dataPos.data(), - slots_[slot].indices.data(), - slots_[slot].sparseFloatValueData.data(), - HPPL_STREAM_1); - } else if (std::dynamic_pointer_cast(mat)) { - std::dynamic_pointer_cast(mat)->copyFrom( - dataPos.data(), - slots_[slot].indices.data(), - slots_[slot].sparseFloatValueData.data()); - } else { - LOG(FATAL) << "Not Supported"; - } - break; - } - case SlotDef::INDEX: { - IVector::resizeOrCreate(cpuArguments[slot].ids, - size, - /* useGpu= */ false); - int* buf = cpuArguments[slot].ids->getData(); - for (int i = 0; i < size; ++i) { - buf[i] = slots_[slot].indexData[dataPos[i]]; - } - break; - } - case SlotDef::VAR_MDIM_DENSE: { - CHECK_EQ(size, 1); - auto mat = cpuArguments[slot].value; - size_t totalDim = slots_[slot].varDenseData[dataPos[0]].data.size(); - - CHECK_EQ(slots_[slot].varDenseData[dataPos[0]].dims.size(), size_t(3)); - size_t height, width, depth, oldWidth; - /* dims[2] is depth, will be changed to dims[0] in future */ - depth = slots_[slot].varDenseData[dataPos[0]].dims[2]; - height = slots_[slot].varDenseData[dataPos[0]].dims[1]; - width = slots_[slot].varDenseData[dataPos[0]].dims[0]; - oldWidth = width; - /* process the undesirable sample */ - if (oldWidth < height) { - width = height; - } - cpuArguments[slot].setFrameHeight(height); - cpuArguments[slot].setFrameWidth(width); - - if (oldWidth < height) { - totalDim = width * height * depth; - } - Matrix::resizeOrCreate(cpuArguments[slot].value, - size, - totalDim, - false, // trans = false - false); // useGpu = false - real* buf = cpuArguments[slot].value->getData(); - cpuArguments[slot].value->zeroMem(); - if (oldWidth < height) { - real* srcBuf = slots_[slot].varDenseData[dataPos[0]].data.data(); - for (size_t i = 0; i < depth; i++) { - for (size_t j = 0; j < height; j++) { - for (size_t k = 0; k < oldWidth; k++) { - buf[i * height * width + j * width + k] = - srcBuf[i * height * oldWidth + j * oldWidth + k]; - } - } - } - } else { - memcpy(buf, - slots_[slot].varDenseData[dataPos[0]].data.data(), - sizeof(real) * totalDim); - } - ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions, - size + 1, /* size == 1 currently */ - /* useGpu= */ false); - int* bufStarts = - cpuArguments[slot].sequenceStartPositions->getMutableData(false); - bufStarts[0] = 0; - bufStarts[1] = 1; - break; - } - case SlotDef::VAR_MDIM_INDEX: { - CHECK_EQ(size, 1); - size_t totalDim = slots_[slot].varIndices[dataPos[0]].size(); - IVector::resizeOrCreate(cpuArguments[slot].ids, - totalDim, - /* useGpu= */ false); - int* buf = cpuArguments[slot].ids->getData(); - memcpy(buf, - slots_[slot].varIndices[dataPos[0]].data(), - sizeof(int) * totalDim); - - ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions, - size + 1, /* size == 1 currently */ - /* useGpu= */ false); - int* bufStarts = - cpuArguments[slot].sequenceStartPositions->getMutableData(false); - bufStarts[0] = 0; - /* we expand the convolutinal feature map to a sequence data, - * so there should be a corresponding sequence labels */ - bufStarts[1] = totalDim; - break; - } - case SlotDef::STRING: { - if (cpuArguments[slot].strs) { - cpuArguments[slot].strs->resize(size); - } else { - cpuArguments[slot].strs = - std::make_shared>(size); - } - for (int i = 0; i < size; ++i) { - (*cpuArguments[slot].strs)[i] = slots_[slot].strData[dataPos[i]]; - } - break; - } - } - } - - if (useGpu_) { - std::vector& cpuArguments = cpuBatch.getStreams(); - DataBatch& gpuBatch = *gpuBatch_; - std::vector& gpuArguments = gpuBatch.getStreams(); - gpuArguments.resize(cpuArguments.size()); - gpuBatch.setSize(size); - for (int i = 0; i < header_.slot_defs_size(); ++i) { - SlotDef::SlotType slotType = header_.slot_defs(i).type(); - if (SlotDef::VECTOR_SPARSE_VALUE == slotType || - SlotDef::VECTOR_SPARSE_NON_VALUE == slotType) { - gpuArguments[i] = cpuArguments[i]; - gpuArguments[i].sequenceStartPositions = - cpuArguments[i].sequenceStartPositions; - } else { - gpuArguments[i].resizeAndCopyFrom( - cpuArguments[i], useGpu_, HPPL_STREAM_1); - } - } - hl_stream_synchronize(HPPL_STREAM_1); - *batch = gpuBatch; - } else { - *batch = cpuBatch; - } - - currentSequenceIndex_ += numScannedSeqs; - - return batch->getSize(); -} - -ProtoSequenceDataProvider::ProtoSequenceDataProvider(const DataConfig& config, - bool useGpu, - bool loadDataAll) - : ProtoDataProvider(config, useGpu, loadDataAll) {} - -int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size, - DataBatch* batch) { - CHECK(iidData()) << "ProtoSequenceDataProvider only accepts iid data"; - int64_t numSequences = 0; // actual number of sequences in the batch - - // the number of sequences scanned, including those skipped because too long - int64_t numScannedSeqs = 0; - std::lock_guard guard(lock_); - size = std::min(getSize() - currentSequenceIndex_, size); - numScannedSeqs = numSequences = size; - if (size <= 0) return 0; - - DataBatch& cpuBatch = *cpuBatch_; - std::vector& cpuArguments = cpuBatch.getStreams(); - cpuBatch.setSize(size); - cpuArguments.resize(header_.slot_defs_size()); - - for (int slot = 0; slot < header_.slot_defs_size(); ++slot) { - SlotDef::SlotType slotType = header_.slot_defs(slot).type(); - - std::vector dataPos; - dataPos.reserve(size); - auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); }; - sampleLoop(op, size); - - // current slot: sequenceStartPositions - ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions, - size + 1, - /* useGpu= */ false); - - switch (slotType) { - case SlotDef::VECTOR_SPARSE_VALUE: - case SlotDef::VAR_MDIM_DENSE: - case SlotDef::VAR_MDIM_INDEX: { - LOG(FATAL) << "ProtoSequenceDataProvider only support" - << " VECTOR_DENSE, VECTOR_SPARSE_NON_VALUE and INDEX slots"; - break; - } - case SlotDef::VECTOR_SPARSE_NON_VALUE: { - // copy to IDS, not value - // pointers used in current slot - sparse_non_value_t* data = slots_[slot].sparseNonValueData.data(); - int64_t* indexs = slots_[slot].indices.data(); - int64_t* seqs = dataPos.data(); - - // current slot: i need size instances. what is the total length? - int totalFeatureInCurrentSlot = 0; - for (int ins = 0; ins < size; ins++) { - int64_t currInsId = seqs[ins]; - totalFeatureInCurrentSlot += - indexs[currInsId + 1] - indexs[currInsId]; - // special: if current instance has NO feature in current slot - if (indexs[currInsId + 1] == indexs[currInsId]) { - totalFeatureInCurrentSlot++; - } - } - // done - - // current slot: ids - IVector::resizeOrCreate(cpuArguments[slot].ids, - totalFeatureInCurrentSlot, - /* useGpu= */ false); - - // where to write - int* currPosOfArgumentId = cpuArguments[slot].ids->getData(); - int* currPosOfArgumentSeqStart = - cpuArguments[slot].sequenceStartPositions->getMutableData(false); - int allSequenceLength = 0; - currPosOfArgumentSeqStart[0] = 0; - // for each instance, copy data and fill sequence positions - for (int instance = 0; instance < size; instance++) { - int64_t currInstanceId = seqs[instance]; - int64_t currInstanceLength = - indexs[currInstanceId + 1] - indexs[currInstanceId]; - sparse_non_value_t* currInstanceData = data + indexs[currInstanceId]; - // write sequenceStartPositions - allSequenceLength += currInstanceLength; - currPosOfArgumentSeqStart[instance + 1] = allSequenceLength; - // copy features - for (int featCopier = 0; featCopier < currInstanceLength; - featCopier++) { - currPosOfArgumentId[featCopier] = currInstanceData[featCopier].col; - } - currPosOfArgumentId += currInstanceLength; - // special: if current instance has NO feature in current slot - if (currInstanceLength == 0) { - allSequenceLength++; - currPosOfArgumentSeqStart[instance + 1] = allSequenceLength; - currPosOfArgumentId[0] = -1; - currPosOfArgumentId++; - } - // done - } - if (slots_[slot].subIndices.size()) { - std::vector dataSubPos; - auto op = [this, &dataSubPos](int64_t pos) { - dataSubPos.push_back(pos); - }; - int subSize = subSampleLoop(op, size, slot); - ICpuGpuVector::resizeOrCreate( - cpuArguments[slot].subSequenceStartPositions, subSize + 1, false); - int* currPosOfArgumentSubSeqStart = - cpuArguments[slot].subSequenceStartPositions->getMutableData( - false); - int64_t* subSeqs = dataSubPos.data(); - int64_t* subIndexs = slots_[slot].subIndices.data(); - int allSubSequenceLength = 0; - currPosOfArgumentSubSeqStart[0] = 0; - // for each instance, compute sub-sequence number - for (int instance = 0; instance < subSize; instance++) { - int64_t currSubInstanceId = subSeqs[instance]; - int64_t currSubInstanceLength = - subIndexs[currSubInstanceId + 1] - subIndexs[currSubInstanceId]; - // write subSequenceStartPositions - allSubSequenceLength += currSubInstanceLength; - currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength; - // special: if current instance has NO feature in current slot - if (currSubInstanceLength == 0) { - allSubSequenceLength++; - currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength; - } - } - cpuArguments[slot].checkSubset(); - } - break; - } - case SlotDef::INDEX: { - // label slot - IVector::resizeOrCreate(cpuArguments[slot].ids, - size, - /* useGpu= */ false); - // fill labels - int* buf = cpuArguments[slot].ids->getData(); - for (int i = 0; i < size; ++i) { - buf[i] = slots_[slot].indexData[dataPos[i]]; - } - // label HAS sequence structure - cpuArguments[slot].sequenceStartPositions->fillSequence(false); - break; - } - case SlotDef::VECTOR_DENSE: { - // copy values - size_t dim = header_.slot_defs(slot).dim(); - Matrix::resizeOrCreate(cpuArguments[slot].value, - size, - dim, - false, // trans = false - false); // useGpu = false - real* buf = cpuArguments[slot].value->getData(); - for (int i = 0; i < size; ++i) { - memcpy(buf + i * dim, - slots_[slot].denseData.data() + dataPos[i] * dim, - sizeof(real) * dim); - } - // sequence structure - cpuArguments[slot].sequenceStartPositions->fillSequence(false); - break; - } - default: { LOG(FATAL) << "should not reach here"; } - } - } - - if (useGpu_) { - std::vector& cpuArguments = cpuBatch.getStreams(); - DataBatch& gpuBatch = *gpuBatch_; - std::vector& gpuArguments = gpuBatch.getStreams(); - gpuArguments.resize(cpuArguments.size()); - gpuBatch.setSize(size); - for (size_t i = 0; i < cpuArguments.size(); ++i) { - gpuArguments[i].resizeAndCopyFrom( - cpuArguments[i], useGpu_, HPPL_STREAM_1); - } - hl_stream_synchronize(HPPL_STREAM_1); - *batch = gpuBatch; - } else { - *batch = cpuBatch; - } - - currentSequenceIndex_ += numScannedSeqs; - return batch->getSize(); -} - -} // namespace paddle diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h deleted file mode 100644 index 7dd45e062248f20d24c633dd4e1c8b7eebcbfa1b..0000000000000000000000000000000000000000 --- a/paddle/gserver/dataproviders/ProtoDataProvider.h +++ /dev/null @@ -1,179 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "DataFormat.pb.h" -#include "paddle/utils/Stat.h" - -#include "DataProvider.h" -#include "ProtoReader.h" - -namespace paddle { - -/** - * @brief Provider data from protobuf data file with each sample - * specified by proto message - * - * DataSample defined in DataFormat.proto. - * - * The file format is - * - * header - * - * sample1 - * - * sample2 - * - * ... - * - * sampleN - * - * @note: In the data file, each message is prefixed with its length. - * The read/write of the protbuf are implemented in ProtoReader.h - */ -class ProtoDataProvider : public DataProvider { -public: - ProtoDataProvider(const DataConfig& config, - bool useGpu, - bool loadDataAll = true); - virtual void reset(); - - /** - * @note this size includes the sequences which are skipped because they - * are longer than the batch size. - */ - virtual int64_t getSize() { - int64_t size = sampleNums_; - if (usageRatio_ < 1.0f) { - size = static_cast(size * usageRatio_); - } - return size; - } - virtual void shuffle(); - - void loadData(const std::vector& fileList); - - virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch); - -protected: - /** - * @brief load protobuf data from a list of file - * @param[in] fileName file name of a file which contains - * a list of file names - */ - void loadData(const std::string& fileName); - - /** - * @brief load protobuf data from file - * @param[in] fileName data file name - */ - void loadDataFile(const std::string& fileName); - /** @brief check data header of each data sample - * @param[in] header data header read from protobuf data - */ - void checkDataHeader(const DataHeader& header); - /** - * @brief fill protobuf data into slot_, - * slot_ is a vector of ProtoSlot in memory. - * @param[in] sample data sample read from protobuf data - */ - void fillSlots(const DataSample& sample); - - /** - * @brief return true if each sample is one sequence, i.e., independent - * of other samples. - */ - inline bool iidData() const { return sequenceStartPositions_.empty(); } - - /** - * @brief check that sample is consistent with header_ - */ - void checkSample(const DataSample& sample); - - template - int64_t sequenceLoop(Op op, int64_t size); - - template - int64_t sampleLoop(Op op, int64_t size); - - template - int64_t subSampleLoop(Op op, int64_t size, int slot); - - void showDataStats(); - -protected: - struct ProtoVarSlot { - std::vector data; - std::vector dims; - }; - - struct ProtoSlot { - SlotDef::SlotType type; - int dim; - std::vector indexData; - std::vector denseData; - std::vector sparseNonValueData; - std::vector sparseFloatValueData; - std::vector indices; - std::vector subIndices; - - std::vector varDenseData; - std::vector> varIndices; - std::vector strData; - }; - DataHeader header_; - int numVecSlots_; - - std::vector slots_; - size_t sampleNums_; - - /** - * The starting position of each sequence in samples. - * The last element should be num of samples. - * If empty, each sample is one sequence. - */ - std::vector sequenceStartPositions_; - - int64_t currentSequenceIndex_; - - // The size should be the number of sequences. - std::vector shuffledSequenceIds_; - - ThreadLocalD cpuBatch_; - ThreadLocalD gpuBatch_; - - RWLock lock_; - std::vector nnzStats_; // stats for number of none-zeros entries -}; - -/** - * @brief Special use for Proto data: instances should contain sparse-non-value - * slots - * and label. - * - * @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE - */ -class ProtoSequenceDataProvider : public ProtoDataProvider { -public: - ProtoSequenceDataProvider(const DataConfig& config, - bool useGpu, - bool loadDataAll = true); - ~ProtoSequenceDataProvider() {} - virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch); -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/DotProdLayer.cpp b/paddle/gserver/layers/DotProdLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9e2dbe3c3c416f606d2938701f26288642b55267 --- /dev/null +++ b/paddle/gserver/layers/DotProdLayer.cpp @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/math/Matrix.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +/** + * @brief A layer for computing the dot product of two vectors. + * Input1: vector (batchSize * dim) + * Input2: vector (batchSize * dim) + * Output: a matrix: (batchSize * 1) + */ + +class DotProdLayer : public Layer { +public: + explicit DotProdLayer(const LayerConfig& config) : Layer(config) {} + + ~DotProdLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(dot_prod, DotProdLayer); + +bool DotProdLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2U); + CHECK_EQ(1UL, getSize()) + << "The output dimensionality of this layer should be fixed to 1."; + + return true; +} + +void DotProdLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + + size_t batchSize = inV0->getHeight(); + CHECK_EQ(inV1->getHeight(), batchSize); + CHECK_EQ(inV0->getWidth(), inV1->getWidth()); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + reserveOutput(batchSize, 1); + } + + MatrixPtr outV = getOutputValue(); + { + REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str()); + outV->sumOfProducts(*inV0, *inV1, 1, 0); + } +} + +void DotProdLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr outG = getOutputGrad(); + MatrixPtr inG0 = getInputGrad(0); + MatrixPtr inG1 = getInputGrad(1); + + { + REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str()); + + if (inG0) { + inG0->addRowScale(0, *inV1, *outG); + } + + if (inG1) { + inG1->addRowScale(0, *inV0, *outG); + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.cpp b/paddle/gserver/layers/MKLDNNConcatLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c9099297cc5c741fbae0b42f21b988e6c561ef11 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNConcatLayer.cpp @@ -0,0 +1,202 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNConcatLayer.h" + +using namespace mkldnn; // NOLINT +typedef memory::format format; + +namespace paddle { + +REGISTER_LAYER(mkldnn_concat, MKLDNNConcatLayer); + +bool MKLDNNConcatLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!MKLDNNLayer::init(layerMap, parameterMap)) { + return false; + } + CHECK_GT(inputLayers_.size(), 1UL); + CHECK(!biasParameter_); + return true; +} + +void MKLDNNConcatLayer::reshape( + int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) { + reshapeInput(bs, ih, iw); + ic = inputLayers_[0]->getSize() / ih / iw; + CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize()); + CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw); + CHECK_GT(inputLayers_.size(), 1UL); + channels_.resize(inputLayers_.size()); + channels_[0] = ic; + // need change the output channel, so use oc_ instead + // TODO(TJ): change API, use &oc + oc_ = ic; + for (size_t i = 1; i < inputLayers_.size(); i++) { + int batchsize, height, witdh; + reshapeInput(batchsize, height, witdh, i); + CHECK_EQ(bs, batchsize); + CHECK_EQ(ih, height); + CHECK_EQ(iw, witdh); + + channels_[i] = inputLayers_[i]->getSize() / height / witdh; + CHECK_EQ((size_t)channels_[i] * height * witdh, inputLayers_[i]->getSize()); + oc_ += channels_[i]; + } + oh = ih; + ow = iw; + reshapeOutput(oh, ow); + resizeOutput(bs, oc_ * oh * ow); +} + +void MKLDNNConcatLayer::resetFwd(std::vector& pipeline, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) { + resetFwdBuffers(inVals_, out); + in = inVals_[0]; + + std::shared_ptr fwdPD; + resetFwdPD(fwdPD, inVals_, out); + + resetFwdPipeline(pipeline, fwdPD, inVals_, out); +} + +void MKLDNNConcatLayer::resetBwd(std::vector& pipeline, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) { + resetBwdBuffers(inGrads_, out); + in = inGrads_[0]; + + resetBwdPipeline(pipeline, bwds_, inGrads_, out); +} + +void MKLDNNConcatLayer::resetFwdBuffers(std::vector& inputs, + MKLDNNMatrixPtr& out) { + inputs.resize(inputLayers_.size()); + bool has8c = false, has16c = false, hasnc = false; + for (size_t i = 0; i < inputs.size(); i++) { + // resetInValue will use ic_ so temporary change as current input's channel + // TODO(TJ): change ic_ as vector then can remove channels_ + ic_ = channels_[i]; + resetInValue(inputs[i], nullptr, i); + CHECK(inputs[i]); + auto dm = inputs[i]->getDims(); + // inputs format can be different, but ndims must equal + CHECK(i == 0 || dm.size() == inputs[0]->getDims().size()); + CHECK_EQ(bs_, dm[0]); + CHECK_EQ(channels_[i], dm[1]); + if (dm.size() > 2) { + CHECK_EQ(ih_, dm[2]); + CHECK_EQ(iw_, dm[3]); + } + if (inputs[i]->getFormat() == format::nc) { + hasnc = true; + } + if (inputs[i]->getFormat() == format::nChw8c) { + has8c = true; + } + if (inputs[i]->getFormat() == format::nChw16c) { + has16c = true; + } + } + // change back, ic_ always save the input 0 size + ic_ = channels_[0]; + + format outFmt; + if (has16c && oc_ % 16 == 0) { + outFmt = format::nChw16c; + } else if (has8c && oc_ % 8 == 0) { + outFmt = format::nChw8c; + } else if (hasnc) { + CHECK(oh_ == 1 && ow_ == 1); + outFmt = format::nc; + } else { + outFmt = format::nchw; + } + memory::dims outDims = + hasnc ? memory::dims{bs_, oc_} : memory::dims{bs_, oc_, oh_, ow_}; + auto outPD = MKLDNNMatrix::createPrimitiveDesc(outDims, outFmt, engine_); + resetOutValue(out, outPD); +} + +void MKLDNNConcatLayer::resetFwdPD(std::shared_ptr& pd, + std::vector& inputs, + MKLDNNMatrixPtr out) { + std::vector srcPDs; + for (size_t i = 0; i < inputs.size(); i++) { + srcPDs.push_back(inputs[i]->getPrimitiveDesc()); + } + CHECK(out); + pd.reset(new concat::primitive_desc(out->getMemoryDesc(), axis_, srcPDs)); + CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc()); +} + +void MKLDNNConcatLayer::resetFwdPipeline( + std::vector& pipeline, + std::shared_ptr& pd, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + std::vector srcs; + for (size_t i = 0; i < inputs.size(); i++) { + srcs.push_back(*(inputs[i])); + } + fwd_.reset(new concat(*pd, srcs, *out)); + pipeline.push_back(*fwd_); +} + +void MKLDNNConcatLayer::resetBwdBuffers(std::vector& inputs, + MKLDNNMatrixPtr& out) { + CHECK(outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + CHECK(out); + + inputs.resize(inputLayers_.size()); + for (size_t i = 0; i < inputs.size(); i++) { + CHECK(inVals_[i]); + // resetInGrad will use inVal_ + // TODO(TJ): change move inVals_ to MKLDNNLayer ans remove inVal_ + inVal_ = inVals_[i]; + resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i); + CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc()); + } + // change back, inVal_ always save the input 0 + inVal_ = inVals_[0]; +} + +void MKLDNNConcatLayer::resetBwdPipeline( + std::vector& pipeline, + std::vector>& prims, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + // reset the backward primitives + memory::dims offsets = {0, 0, 0, 0}; + prims.resize(inputs.size()); + CHECK_EQ(inputs.size(), channels_.size()); + for (size_t i = 0; i < inputs.size(); i++) { + auto viewPD = view::primitive_desc( + out->getPrimitiveDesc(), inputs[i]->getDims(), offsets); + auto bwdPD = reorder::primitive_desc(viewPD.dst_primitive_desc(), + inputs[i]->getPrimitiveDesc()); + prims[i].reset(new reorder(bwdPD, *out, *(inputs[i]))); + offsets[axis_] += channels_[i]; + // push to pipeline + pipeline.push_back(*prims[i]); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.h b/paddle/gserver/layers/MKLDNNConcatLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..d5749d327e4259b81541a234f48a4538ab035fe4 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNConcatLayer.h @@ -0,0 +1,129 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "MKLDNNLayer.h" +#include "mkldnn.hpp" + +namespace paddle { + +/** + * @brief A subclass of MKLDNNLayer Concatenate layer. + * + * The config file api is mkldnn_concat + */ +class MKLDNNConcatLayer : public MKLDNNLayer { +protected: + std::vector inVals_; + std::vector inGrads_; + std::vector> bwds_; + // input channel numbers + std::vector channels_; + + // concat_dimension in MKLDNN + // if axis_ == 0, concat batchsize + // if axis_ == 1, concat channel (default) + int axis_; + +public: + explicit MKLDNNConcatLayer(const LayerConfig& config) + : MKLDNNLayer(config), axis_(1) {} + + ~MKLDNNConcatLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void reshape( + int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override; + + void resetFwd(std::vector& pipeline, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) override; + + void resetBwd(std::vector& pipeline, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) override; + + void printSizeInfo() override { + CHECK_EQ(channels_.size(), inputLayers_.size()); + for (size_t i = 0; i < channels_.size(); ++i) { + VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName() + << ": " << bs_ << ", " << channels_[i] << ", " << ih_ + << ", " << iw_; + } + VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_ + << ", " << ow_; + } + + void printValueFormat() override { + for (size_t i = 0; i < inVals_.size(); ++i) { + VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName() + << ": " << inVals_[i]->getFormat() << " >>>"; + } + if (outVal_) { + VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "; + } + if (extOutVal_) { + VLOG(MKLDNN_FMTS) << extOutVal_->getFormat(); + } + } + + void printGradFormat() override { + if (extOutGrad_) { + VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat(); + } + if (outGrad_) { + VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "; + } + for (size_t i = 0; i < inGrads_.size(); ++i) { + VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName() + << ": " << inGrads_[i]->getFormat() << "<<<"; + } + } + +protected: + /** + * Forward functions: reset buffers(inputs, output, bias), + * reset primitive descriptor, + * reset pipeline. + */ + void resetFwdBuffers(std::vector& inputs, + MKLDNNMatrixPtr& out); + void resetFwdPD(std::shared_ptr& pd, + std::vector& inputs, + MKLDNNMatrixPtr out); + void resetFwdPipeline(std::vector& pipeline, + std::shared_ptr& pd, + std::vector& inputs, + MKLDNNMatrixPtr& out); + + /** + * Backward functions: reset buffers(inputs, output, bias) + * reset primitives and pipeline + */ + void resetBwdBuffers(std::vector& inputs, + MKLDNNMatrixPtr& out); + void resetBwdPipeline(std::vector& pipeline, + std::vector>& prims, + std::vector& inputs, + MKLDNNMatrixPtr& out); +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp index e75ac5ba4647a8267b7bc189893bd7adb5c3053f..cf42da0735282d667d6b87061c8c59bf2f96e0be 100644 --- a/paddle/gserver/layers/MKLDNNLayer.cpp +++ b/paddle/gserver/layers/MKLDNNLayer.cpp @@ -21,8 +21,8 @@ namespace paddle { bool MKLDNNLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { - CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn." - << "Please set WITH_MKLDNN=ON " + CHECK(FLAGS_use_mkldnn) << "MKLDNNLayers only support use_mkldnn." + << "Please set WITH_MKL=ON " << "and set use_mkldnn=True"; CHECK(!useGpu_) << "Do not support GPU yet"; @@ -138,8 +138,11 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) { } } -void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) { - const Argument& input = inputLayers_[0]->getOutput(); +void MKLDNNLayer::reshapeInput(int& batchsize, + int& height, + int& width, + size_t inputIdx) { + const Argument& input = inputLayers_[inputIdx]->getOutput(); batchsize = input.getBatchSize(); int h = input.getFrameHeight(); int w = input.getFrameWidth(); diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index 7479c34c92b5231b2521493bc631474d4efd4224..4c42df1bee75fa7b28c2001c30797cc0df7c5554 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -178,7 +178,10 @@ protected: /** * reshape the input image sizes and input batchsize */ - void reshapeInput(int& batchsize, int& height, int& width); + void reshapeInput(int& batchsize, + int& height, + int& width, + size_t inputIdx = 0); /** * reshape output image sizes diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 4bea348f637f39444e8aad89278e6366ecd73b1d..c295ea19c9ccb3d05c509a41925d2c36efdba8ef 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -29,7 +29,7 @@ gserver_test(test_KmaxSeqScore) gserver_test(test_Expand) gserver_test(test_MaxPoolingWithMaskOutput) -########## test_Mkldnn layers and activations ########## +########## test_MKLDNN layers and activations ########## if(WITH_MKLDNN) add_unittest_without_exec(test_MKLDNN test_MKLDNN.cpp @@ -62,17 +62,6 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE) endif() if(NOT MOBILE_INFERENCE) -################### test_ProtoDataProvider ############ - add_unittest_without_exec(test_ProtoDataProvider - test_ProtoDataProvider.cpp) - - # test_ProtoDataProvider will mkdir as same name, - # so if WORKING_DIRECTORY is default directory, then - # mkdir will get error. - add_test(NAME test_ProtoDataProvider - COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) - ################## test_Evaluator ####################### add_unittest(test_Evaluator test_Evaluator.cpp) @@ -110,3 +99,24 @@ add_test(NAME test_PyDataProvider2 COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2 WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle ) + +################# test_CompareSparse ################## +add_unittest_without_exec(test_CompareSparse + test_CompareSparse.cpp) +if(NOT ON_TRAVIS) + add_test(NAME test_CompareSparse + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests + ./.set_port.sh -p port -n 6 + ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) +endif() + +################ test_CompareTwoNets ###################### +add_unittest_without_exec(test_CompareTwoNets + test_CompareTwoNets.cpp) +add_test(NAME test_CompareTwoNets + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests + ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h index ca55a45bc77b4e171619ab788d7c7dfeefcd036a..9d61533c0b6f20c41130d7b7c15ad93392b2d24c 100644 --- a/paddle/gserver/tests/MKLDNNTester.h +++ b/paddle/gserver/tests/MKLDNNTester.h @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { /** - * @brief test the functionality of Mkldnnlayers + * @brief test the functionality of MKLDNNlayers and MKLDNNActivations * refer to paddle original function */ class MKLDNNTester { diff --git a/paddle/gserver/tests/proto_files.txt b/paddle/gserver/tests/proto_files.txt deleted file mode 100644 index 691b38c7940bd21360eb00384e060554aa4b3e22..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/proto_files.txt +++ /dev/null @@ -1,2 +0,0 @@ -./test_ProtoDataProvider/data1.bin -./test_ProtoDataProvider/data2.bin diff --git a/paddle/gserver/tests/proto_files_compressed.txt b/paddle/gserver/tests/proto_files_compressed.txt deleted file mode 100644 index 7413c81e185d02e0d03aefa06480b9722357c5eb..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/proto_files_compressed.txt +++ /dev/null @@ -1,2 +0,0 @@ -./test_ProtoDataProvider/data1.bin.gz -./test_ProtoDataProvider/data2.bin.gz diff --git a/paddle/gserver/tests/sequence_lstm.conf b/paddle/gserver/tests/sequence_lstm.conf new file mode 100644 index 0000000000000000000000000000000000000000..f49a827f22edce056eaf9903e99b732cab7f3784 --- /dev/null +++ b/paddle/gserver/tests/sequence_lstm.conf @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file": dict_file}) + +settings(batch_size=5) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 256 +label_dim = 3 +sparse_update = get_config_arg("sparse_update", bool, False) + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer( + input=data, + size=word_dim, + param_attr=ParamAttr(sparse_update=sparse_update)) + +with mixed_layer(size=hidden_dim * 4) as lstm_input: + lstm_input += full_matrix_projection(input=emb) + +lstm = lstmemory( + input=lstm_input, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation()) + +lstm_last = last_seq(input=lstm) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=lstm_last) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_recurrent.py b/paddle/gserver/tests/sequence_recurrent.py new file mode 100644 index 0000000000000000000000000000000000000000..4895df186bfecc5cb5263676a9cd5bac5039d565 --- /dev/null +++ b/paddle/gserver/tests/sequence_recurrent.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file": dict_file}) + +settings(batch_size=5) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 128 +label_dim = 3 + +# This config is designed to be equivalent with sequence_recurrent_group.py + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer( + input=data, size=word_dim, param_attr=ParamAttr(name="emb")) + +recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation()) + +recurrent_last = last_seq(input=recurrent) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=recurrent_last) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_recurrent_group.py b/paddle/gserver/tests/sequence_recurrent_group.py new file mode 100644 index 0000000000000000000000000000000000000000..a1d54542e3bc4e89f70d31d5e89c0f44953c9f90 --- /dev/null +++ b/paddle/gserver/tests/sequence_recurrent_group.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file": dict_file}) + +settings(batch_size=5) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 128 +label_dim = 3 + +# This config is designed to be equivalent with sequence_recurrent.py + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer( + input=data, size=word_dim, param_attr=ParamAttr(name="emb")) + + +def step(y): + mem = memory(name="rnn_state", size=hidden_dim) + with mixed_layer( + name="rnn_state", + size=hidden_dim, + bias_attr=False, + act=SoftmaxActivation()) as out: + out += identity_projection(input=y) + out += full_matrix_projection( + input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__")) + return out + + +recurrent = recurrent_group(name="rnn", step=step, input=emb) + +recurrent_last = last_seq(input=recurrent) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=recurrent_last) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/gserver/tests/test_CompareSparse.cpp similarity index 98% rename from paddle/trainer/tests/test_CompareSparse.cpp rename to paddle/gserver/tests/test_CompareSparse.cpp index 5f1834bd730375fc10762fc19788d0c693f8e752..c6e07650fc4805a25baf38b9059f6c996d00cafc 100644 --- a/paddle/trainer/tests/test_CompareSparse.cpp +++ b/paddle/gserver/tests/test_CompareSparse.cpp @@ -22,8 +22,7 @@ limitations under the License. */ using namespace paddle; // NOLINT using namespace std; // NOLINT -static const string& configFile1 = - "trainer/tests/sample_trainer_config_compare_sparse.conf"; +static const string& configFile1 = "gserver/tests/sequence_lstm.conf"; DECLARE_bool(use_gpu); DECLARE_string(config); diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/gserver/tests/test_CompareTwoNets.cpp similarity index 95% rename from paddle/trainer/tests/test_CompareTwoNets.cpp rename to paddle/gserver/tests/test_CompareTwoNets.cpp index 94f65e545d116c802fb4877dc14f07aaaf83a4fb..801d9607565910b1f7f68a9c4532de5877e44f30 100644 --- a/paddle/trainer/tests/test_CompareTwoNets.cpp +++ b/paddle/gserver/tests/test_CompareTwoNets.cpp @@ -30,8 +30,6 @@ DECLARE_bool(use_gpu); DECLARE_string(config); DECLARE_string(nics); -DEFINE_string(config_file_a, "", "config of one network to compare"); -DEFINE_string(config_file_b, "", "config of another network to compare"); DEFINE_bool(need_high_accuracy, false, "whether need to run in double accuracy"); @@ -42,6 +40,10 @@ DEFINE_double( DECLARE_bool(thread_local_rand_use_global_seed); DECLARE_int32(seed); +static const string& config_file_a = "gserver/tests/sequence_recurrent.py"; +static const string& config_file_b = + "gserver/tests/sequence_recurrent_group.py"; + struct ComData { vector outArgs; vector parameters; @@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) { DataBatch dataBatch; int32_t batchSize = trainer.getConfig().opt_config().batch_size(); + trainer.getDataProvider()->reset(); trainer.getDataProvider()->setSkipShuffle(); trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch); @@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { TEST(Trainer, create) { ComData dataA; - calcGradient(dataA, FLAGS_config_file_a); + calcGradient(dataA, config_file_a); LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n"; ComData dataB; - calcGradient(dataB, FLAGS_config_file_b); + calcGradient(dataB, config_file_b); LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n"; compareGradient(dataA, dataB); diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 3517d293e3c901caaa19952b04e56d1ef0d2b46e..fb4eea6f67da9078ef43268a3a1603dc6ccfa652 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1081,6 +1081,21 @@ TEST(Layer, InterpolationLayer) { } } +TEST(Layer, DotProdLayer) { + TestConfig config; + config.layerConfig.set_type("dot_prod"); + config.layerConfig.set_size(1); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "dot_prod", 10, false, useGpu); + } +} + TEST(Layer, OuterProdLayer) { TestConfig config; config.layerConfig.set_type("out_prod"); diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp index a859e34c8996d81f14bf1edcb6e23d5a4f687e6b..42644e9601a82ea81c417adc6441edeb036998e2 100644 --- a/paddle/gserver/tests/test_MKLDNN.cpp +++ b/paddle/gserver/tests/test_MKLDNN.cpp @@ -313,6 +313,47 @@ TEST(MKLDNNLayer, AddtoLayer) { testAddtoLayer({4, 12, 1, 1}, 3); } +static void getMKLDNNConcatConfig(TestConfig& cfg, + const std::vector& inputs) { + CHECK_GE(inputs.size(), 2) << "at least two inputs"; + int oc = inputs[0].ic; + for (size_t i = 1; i < inputs.size(); ++i) { + CHECK_EQ(inputs[i].bs, inputs[0].bs); + CHECK_EQ(inputs[i].ih, inputs[0].ih); + CHECK_EQ(inputs[i].iw, inputs[0].iw); + oc += inputs[i].ic; + } + cfg.biasSize = 0; + cfg.layerConfig.set_type("mkldnn_concat"); + cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw); + cfg.layerConfig.set_active_type("relu"); + for (size_t i = 0; i < inputs.size(); ++i) { + std::stringstream ss; + ss << "layer_" << i; + cfg.inputDefs.push_back( + {INPUT_DATA, + ss.str(), + (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw, + 0}); + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + ImageConfig* img_conf = input->mutable_image_conf(); + img_conf->set_channels(inputs[i].ic); + img_conf->set_img_size_y(inputs[i].ih); + img_conf->set_img_size(inputs[i].iw); + } +} + +void testConcatLayer(const std::vector& inputs) { + TestConfig dnnConfig; + getMKLDNNConcatConfig(dnnConfig, inputs); + RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0]) +} + +TEST(MKLDNNLayer, ConcatLayer) { + testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}}); + testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}}); +} + void testActivation(std::string actType, const testImageDesc& pm) { // TODO(TJ): remove me when paddle support elu activation if (actType == "mkldnn_elu") { diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp deleted file mode 100644 index af6472619d1840e82787974d265d601b4a406c09..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_ProtoDataProvider.cpp +++ /dev/null @@ -1,732 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include - -#include "paddle/gserver/dataproviders/ProtoDataProvider.h" -#include "paddle/utils/Util.h" - -#include "paddle/testing/TestUtil.h" - -using namespace std; // NOLINT - -std::vector protoFiles{ - "./test_ProtoDataProvider/data1.bin", "./test_ProtoDataProvider/data2.bin", -}; -std::vector protoFilesCompressed{ - "./test_ProtoDataProvider/data1.bin.gz", - "./test_ProtoDataProvider/data2.bin.gz", -}; - -const char* kTestDir = "./test_ProtoDataProvider"; -const char kProtoFileList[] = "gserver/tests/proto_files.txt"; -const char kProtoFileListCompressed[] = - "gserver/tests/proto_files_compressed.txt"; -const int kSpraseMatrixDim = 1024; - -using namespace paddle; // NOLINT - -void prepareData(DataBatch* batch, - const int* numPerSlotType, - bool iid, - bool useGpu) { - batch->clear(); - int64_t size = uniformRandom(100) + 10; - batch->setSize(size); - - ICpuGpuVectorPtr sequenceStartPositions; - ICpuGpuVectorPtr subSequenceStartPositions; - if (!iid) { - int numSeqs = uniformRandom(10) + 1; - sequenceStartPositions = - ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false); - int* buf = sequenceStartPositions->getMutableData(false); - subSequenceStartPositions = - ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false); - int* subBuf = subSequenceStartPositions->getMutableData(false); - int64_t pos = 0; - int maxLen = 2 * size / numSeqs; - for (int i = 0; i < numSeqs; ++i) { - int len = - uniformRandom(min(maxLen, size - pos - numSeqs + i)) + 1; - buf[i] = pos; - subBuf[i] = pos; - pos += len; - VLOG(1) << " len=" << len; - } - buf[numSeqs] = size; - subBuf[numSeqs] = size; - } - - vector& arguments = batch->getStreams(); - for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_DENSE]; ++i) { - int64_t dim = rand() % 10 + 4; // NOLINT rand_r - MatrixPtr mat = Matrix::create(size, dim, /* trans= */ false, false); - mat->randomizeUniform(); - Argument arg; - arg.value = mat; - arg.sequenceStartPositions = sequenceStartPositions; - arguments.push_back(arg); - } - for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE]; ++i) { - MatrixPtr mat = - makeRandomSparseMatrix(size, kSpraseMatrixDim, false, useGpu); - Argument arg; - arg.value = mat; - arg.sequenceStartPositions = sequenceStartPositions; - arg.subSequenceStartPositions = subSequenceStartPositions; - arguments.push_back(arg); - } - for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE]; ++i) { - MatrixPtr mat = - makeRandomSparseMatrix(size, kSpraseMatrixDim, true, useGpu); - Argument arg; - arg.value = mat; - arg.sequenceStartPositions = sequenceStartPositions; - arguments.push_back(arg); - } - for (int i = 0; i < numPerSlotType[SlotDef::STRING]; ++i) { - int64_t dim = rand() % 10 + 4; // NOLINT rand_r - SVectorPtr vec = std::make_shared>(); - for (int j = 0; j < size; ++j) { - vec->push_back(randStr(dim)); - } - Argument arg; - arg.strs = vec; - arg.sequenceStartPositions = sequenceStartPositions; - arguments.push_back(arg); - } - for (int i = 0; i < numPerSlotType[SlotDef::INDEX]; ++i) { - int64_t dim = rand() % 10 + 4; // NOLINT rand_r - IVectorPtr vec = IVector::create(size, /* useGpu= */ false); - int* buf = vec->getData(); - for (int j = 0; j < size; ++j) { - buf[j] = uniformRandom(dim); - } - Argument arg; - arg.ids = vec; - arg.sequenceStartPositions = sequenceStartPositions; - arguments.push_back(arg); - } -} - -inline int getSlotDim(const Argument& arg) { - if (arg.value) { - return arg.value->getWidth(); - } else if (arg.ids) { - return arg.ids->getMax() + 1; - } else if (arg.strs) { - return 1; - } - LOG(FATAL) << "Invalid argument"; - return 0; -} - -inline SlotDef::SlotType getSlotType(const Argument& arg) { - if (arg.value) { - auto& m = *arg.value; - auto& type = typeid(m); - if (type == typeid(CpuMatrix) || type == typeid(GpuMatrix)) { - return SlotDef::VECTOR_DENSE; - } - if (type == typeid(CpuSparseMatrix)) { - auto valueType = - std::dynamic_pointer_cast(arg.value)->getValueType(); - if (NO_VALUE == valueType) { - return SlotDef::VECTOR_SPARSE_NON_VALUE; - } else { - return SlotDef::VECTOR_SPARSE_VALUE; - } - } - if (type == typeid(GpuSparseMatrix)) { - auto valueType = - std::dynamic_pointer_cast(arg.value)->getValueType(); - if (NO_VALUE == valueType) { - return SlotDef::VECTOR_SPARSE_NON_VALUE; - } else { - return SlotDef::VECTOR_SPARSE_VALUE; - } - } - - LOG(FATAL) << "Unknown matrix type"; - } - if (arg.ids) return SlotDef::INDEX; - if (arg.strs) return SlotDef::STRING; - LOG(FATAL) << "Invalid argument"; - return SlotDef::VECTOR_DENSE; -} - -void getColRow(const Argument& arg, - int64_t pos, - bool useGpu, - int* colNum, - const int** rowCols, - const real** rowValues) { - SlotDef::SlotType type = getSlotType(arg); - GpuSparseMatrixPtr matGpu; - CpuSparseMatrixPtr matCpu; - if (useGpu) { - matGpu = dynamic_pointer_cast(arg.value); - ASSERT_TRUE(matGpu != NULL); - } else { - matCpu = dynamic_pointer_cast(arg.value); - ASSERT_TRUE(matCpu != NULL); - } - *colNum = useGpu ? matGpu->getColNum(pos) : matCpu->getColNum(pos); - *rowCols = useGpu ? matGpu->getRowCols(pos) : matCpu->getRowCols(pos); - if (type == SlotDef::VECTOR_SPARSE_VALUE) { - *rowValues = useGpu ? matGpu->getRowValues(pos) : matCpu->getRowValues(pos); - } else { - *rowValues = NULL; - } -} - -void makeSample(const vector& arguments, - int64_t pos, - bool isBeginning, - DataSample* sample, - bool useGpu) { - sample->set_is_beginning(isBeginning); - int slotid = 0; - for (auto& arg : arguments) { - SlotDef::SlotType type = getSlotType(arg); - int64_t dim = getSlotDim(arg); - switch (type) { - case SlotDef::VECTOR_DENSE: { - VectorSlot* vecSlot = sample->add_vector_slots(); - auto values = vecSlot->mutable_values(); - values->Reserve(dim); - for (int i = 0; i < dim; ++i) { - values->AddAlreadyReserved( - static_cast(arg.value->getElement(pos, i))); - } - break; - } - case SlotDef::INDEX: { - sample->add_id_slots(arg.ids->get(pos)); - break; - } - case SlotDef::VECTOR_SPARSE_NON_VALUE: { - VectorSlot* vecSlot = sample->add_vector_slots(); - auto ids = vecSlot->mutable_ids(); - int colNum; - const int* rowCols; - const real* rowValues; // nullptr - getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues); - ids->Reserve(colNum); - for (int i = 0; i < colNum; ++i) { - ids->AddAlreadyReserved(rowCols[i]); - } - SubseqSlot* subseqSlot = sample->add_subseq_slots(); // subseq - subseqSlot->set_slot_id(slotid); - auto lens = subseqSlot->mutable_lens(); - lens->Add(colNum); - break; - } - case SlotDef::VECTOR_SPARSE_VALUE: { - VectorSlot* vecSlot = sample->add_vector_slots(); - auto values = vecSlot->mutable_values(); - auto ids = vecSlot->mutable_ids(); - int colNum; - const int* rowCols; - const real* rowValues; - getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues); - ids->Reserve(colNum); - values->Reserve(colNum); - for (int i = 0; i < colNum; ++i) { - ids->AddAlreadyReserved(rowCols[i]); - values->AddAlreadyReserved(rowValues[i]); - } - break; - } - case SlotDef::VAR_MDIM_DENSE: - case SlotDef::VAR_MDIM_INDEX: { - LOG(FATAL) << "Not implemented"; - break; - } - case SlotDef::STRING: { - VectorSlot* vecSlot = sample->add_vector_slots(); - vecSlot->add_strs((*arg.strs)[pos]); - break; - } - } - slotid++; - } -} - -void writeData(const DataBatch& batch, bool useGpu, bool dataCompression) { - DataHeader header; - const vector& arguments = batch.getStreams(); - for (auto& argument : arguments) { - SlotDef* slotDef = header.add_slot_defs(); - slotDef->set_type(getSlotType(argument)); - slotDef->set_dim(getSlotDim(argument)); - } - VLOG(1) << "header=" << header.DebugString(); - - int64_t totalSeqs = batch.getNumSequences(); - int64_t seq = 0; - ICpuGpuVectorPtr sequenceStartPositions = arguments[0].sequenceStartPositions; - int64_t numWritten = 0; - vector curProtoFiles = - dataCompression ? protoFilesCompressed : protoFiles; - for (size_t i = 0; i < curProtoFiles.size(); ++i) { - int64_t numSeqs = totalSeqs * (i + 1) / curProtoFiles.size() - - totalSeqs * i / curProtoFiles.size(); - ofstream os(curProtoFiles[i]); - CHECK(os) << "Fail to open " << curProtoFiles[i]; - unique_ptr writer(new ProtoWriter(&os, dataCompression)); - CHECK(writer->write(header)); - for (int j = 0; j < numSeqs; ++j, ++seq) { - int64_t begin = seq; - int64_t end = seq + 1; - if (sequenceStartPositions) { - begin = sequenceStartPositions->getElement(seq); - end = sequenceStartPositions->getElement(seq + 1); - } - for (int pos = begin; pos < end; ++pos) { - DataSample sample; - makeSample(arguments, pos, pos == begin, &sample, useGpu); - CHECK(writer->write(sample)); - ++numWritten; - } - } - - writer.reset(nullptr); - os.close(); - } - CHECK_EQ(arguments[0].getBatchSize(), numWritten); -} - -// check that the sample at pos1 in args1 is same as the sample at pos2 in args2 -void checkSample(const vector& args1, - int64_t pos1, - const vector& args2, - int64_t pos2, - bool useGpu) { - EXPECT_EQ(args1.size(), args2.size()); - VLOG(1) << " pos1=" << pos1 << " pos2=" << pos2; - - for (size_t i = 0; i < args1.size(); ++i) { - auto type = getSlotType(args1[i]); - int dim = getSlotDim(args1[i]); - EXPECT_EQ(type, getSlotType(args2[i])); - if (type == SlotDef::INDEX) { - EXPECT_GE(dim, getSlotDim(args2[i])); - } else { - EXPECT_EQ(dim, getSlotDim(args2[i])); - } - switch (type) { - case SlotDef::VECTOR_DENSE: { - for (int j = 0; j < dim; ++j) { - EXPECT_EQ(static_cast(args1[i].value->getElement(pos1, j)), - static_cast(args2[i].value->getElement(pos2, j))); - } - break; - } - case SlotDef::INDEX: { - EXPECT_EQ(args1[i].ids->get(pos1), args2[i].ids->get(pos2)); - break; - } - case SlotDef::VECTOR_SPARSE_NON_VALUE: - case SlotDef::VECTOR_SPARSE_VALUE: { - int colNum1, colNum2; - const int *rowCols1, *rowCols2; - const real *rowValues1, *rowValues2; - getColRow(args1[i], pos1, useGpu, &colNum1, &rowCols1, &rowValues1); - getColRow(args2[i], pos2, useGpu, &colNum2, &rowCols2, &rowValues2); - EXPECT_EQ(colNum1, colNum2); - for (int j = 0; j < colNum1; ++j) { - EXPECT_EQ(rowCols1[j], rowCols2[j]); - if (type == SlotDef::VECTOR_SPARSE_VALUE) { - EXPECT_EQ(rowValues1[j], rowValues2[j]); - } - } - break; - } - case SlotDef::VAR_MDIM_DENSE: - case SlotDef::VAR_MDIM_INDEX: { - LOG(FATAL) << "Not implemented"; - break; - } - case SlotDef::STRING: { - EXPECT_EQ((*args1[i].strs)[pos1], (*args2[i].strs)[pos2]); - break; - } - } - } -} - -void testProtoDataProvider(int* numPerSlotType, - bool iid, - bool async, - bool useGpu, - bool dataCompression, - int numConstantSlots = 0) { - mkDir(kTestDir); - DataBatch data; - - prepareData(&data, numPerSlotType, iid, useGpu); - writeData(data, useGpu, dataCompression); - - DataConfig config; - config.set_type("proto"); - config.set_files(dataCompression ? kProtoFileListCompressed : kProtoFileList); - config.set_async_load_data(async); - - for (int i = 0; i < numConstantSlots; ++i) { - config.add_constant_slots(i + 11); - MatrixPtr w = Matrix::create(data.getSize(), - 1, - /* trans= */ false, - /* useGpu= */ false); - w->assign(config.constant_slots(i)); - data.appendData(w); - } - - unique_ptr dataProvider(DataProvider::create(config, useGpu)); - dataProvider->setSkipShuffle(); - - EXPECT_EQ(data.getSize(), dataProvider->getSize()); - - int64_t batchSize = 10; - DataBatch batch; - - size_t seq1 = 0; - vector& args1 = data.getStreams(); - ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions; - - dataProvider->reset(); - - while (dataProvider->getNextBatch(batchSize, &batch) > 0) { - CHECK_EQ(data.getNumStreams(), batch.getNumStreams()); - vector& args2 = batch.getStreams(); - ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions; - for (auto& arg : args2) { - EXPECT_EQ(iid, !arg.sequenceStartPositions); - } - size_t numSeqs = batch.getNumSequences(); - VLOG(1) << "numSeqs=" << numSeqs; - for (size_t seq2 = 0; seq2 < numSeqs; ++seq1, ++seq2) { - int64_t begin1 = seq1; - int64_t end1 = seq1 + 1; - if (sequenceStartPositions1) { - begin1 = sequenceStartPositions1->getElement(seq1); - end1 = sequenceStartPositions1->getElement(seq1 + 1); - EXPECT_LT(seq1, sequenceStartPositions1->getSize() - 1); - } - - int64_t begin2 = seq2; - int64_t end2 = seq2 + 1; - if (sequenceStartPositions2) { - begin2 = sequenceStartPositions2->getElement(seq2); - end2 = sequenceStartPositions2->getElement(seq2 + 1); - } - VLOG(1) << " begin1=" << begin1 << " end1=" << end1 - << " begin2=" << begin2 << " end2=" << end2; - EXPECT_EQ(end1 - begin1, end2 - begin2); - for (int i = 0; i < end1 - begin1; ++i) { - checkSample(args1, begin1 + i, args2, begin2 + i, useGpu); - } - } - } - - EXPECT_EQ(seq1, (size_t)data.getNumSequences()); - rmDir(kTestDir); -} - -TEST(ProtoDataProvider, test) { - int numSlotsArray[] = {0, 3}; - int numTwoArray[] = {0, 1}; - int numSlotsArraySize = sizeof(numSlotsArray) / sizeof(numSlotsArray[0]); - const int numSlot = 5; - int combination[numSlot] = {0}; - int k = numSlot - 1; - while (k >= 0) { - int numDenseVecSlots = numSlotsArray[combination[0]]; - int numSparseNonValueVecSlots = numSlotsArray[combination[1]]; - int numSparseValueVectorSlots = numSlotsArray[combination[2]]; - int numStrSlots = numSlotsArray[combination[3]]; - int numIdSlots = numSlotsArray[combination[4]]; - // while loop : traverse all cases - k = numSlot - 1; - while (k >= 0) { - if (combination[k] < (numSlotsArraySize - 1)) { - ++combination[k]; - break; - } else { - combination[k] = 0; - --k; - } - } - if (numDenseVecSlots + numSparseNonValueVecSlots + - numSparseValueVectorSlots + numStrSlots + numIdSlots < - 1) - continue; - for (int iid : numTwoArray) { - for (int async : numTwoArray) { - for (int useGpu : numTwoArray) { - for (int dataCompression : numTwoArray) { - if (async && useGpu) { - // Currently in async mode, useGpu is not supported - continue; - } -#ifndef PADDLE_WITH_CUDA - if (useGpu) { - continue; - } -#endif - LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots - << " numSparseNonValueVecSlots=" - << numSparseNonValueVecSlots - << " numSparseValueVectorSlots=" - << numSparseValueVectorSlots - << " numStrSlots=" << numStrSlots - << " numIdSlots=" << numIdSlots << " iid=" << iid - << " async=" << async << " useGpu=" << useGpu - << " dataCompression=" << dataCompression; - int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0}; - numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots; - numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] = - numSparseNonValueVecSlots; - numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] = - numSparseValueVectorSlots; - numPerSlotType[SlotDef::INDEX] = numIdSlots; - numPerSlotType[SlotDef::STRING] = numStrSlots; - testProtoDataProvider( - numPerSlotType, iid, async, useGpu, dataCompression); - } // end for (int dataCompression : numTwoArray) - } // end for (int useGpu : numTwoArray) - } // end for (int async : numTwoArray) - } // end for (int iid : numTwoArray) - } // end for (while, traverse all slots) -} - -TEST(ProtoDataProvider, constant_slots) { - int numSlotsArray[] = {0, 3}; - int numTwoArray[] = {0, 1}; - for (int numDenseVecSlots : numSlotsArray) { - for (int numSparseNonValueVecSlots : numSlotsArray) { - if (numDenseVecSlots + numSparseNonValueVecSlots < 1) continue; - for (int numConstantSlots : {1, 2}) { - for (int useGpu : numTwoArray) { - for (int dataCompression : numTwoArray) { -#ifndef PADDLE_WITH_CUDA - if (useGpu) { - continue; - } -#endif - LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots - << " numSparseNonValueVecSlots=" - << numSparseNonValueVecSlots - << " numConstantSlogs=" << numConstantSlots - << " useGpu=" << useGpu - << " dataCompression=" << dataCompression; - int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0}; - numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots; - numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] = - numSparseNonValueVecSlots; - numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] = 1; - numPerSlotType[SlotDef::INDEX] = 1; - testProtoDataProvider(numPerSlotType, - /* iid= */ true, - /* async= */ false, - useGpu, - dataCompression, - numConstantSlots); - } // end for (int dataCompression : numTwoArray) - } // end for (int useGpu : numTwoArray) - } // end for (int numConstantSlots : {1, 2}) - } // end for (int numSparseNonValueVecSlots : numSlotsArray) - } // end for (int numDenseVecSlots : numSlotsArray) -} - -void checkSampleSequence(const vector& args1, - const vector& args2, - int64_t offset, - int64_t numSeqs, - bool useGpu) { - // check slot num are equal - EXPECT_EQ(args1.size(), args2.size()); - for (size_t i = 0; i < args1.size(); i++) { - auto type = getSlotType(args1[i]); - // check for args2: sequenceStartPositions vs numSeqs - // (1) size - EXPECT_EQ(args2[i].sequenceStartPositions->getSize(), (size_t)numSeqs + 1); - // (2) content - auto checkArgContent = [&](const Argument& args, int numSeqs) { - for (int j = 0; j <= numSeqs; j++) { - int start_pos = args.sequenceStartPositions->getElement(j); - EXPECT_EQ(start_pos, j); - } - }; - switch (type) { - case SlotDef::INDEX: { - // args1: for label - checkArgContent(args2[i], numSeqs); - // check for args2: ids are equal to args1[offset] - // (1) size - EXPECT_EQ(args2[i].ids->getSize(), (size_t)numSeqs); - // (2) content - for (int j = 0; j < numSeqs; j++) { - EXPECT_EQ(args2[i].ids->get(j), args1[i].ids->get(offset + j)); - } - break; - } - case SlotDef::VECTOR_SPARSE_NON_VALUE: { - // args1: for sparse_non_value - // args2 should put sparse indexes in ids - int colNum1; - const int* rowCols1; - const real* rowValues1; // nullptr - int totalLength = 0; - for (int j = 0; j < numSeqs; j++) { - getColRow( - args1[i], offset + j, useGpu, &colNum1, &rowCols1, &rowValues1); - // (1) lengths - EXPECT_EQ(totalLength, - args2[i].sequenceStartPositions->getElement(j)); - EXPECT_EQ(totalLength, - args2[i].subSequenceStartPositions->getElement(j)); - // (2) content - for (int k = 0; k < colNum1; k++) { - EXPECT_EQ(rowCols1[k], args2[i].ids->get(totalLength + k)); - } - totalLength += colNum1; - if (colNum1 == 0) { - // special case here: we will put a "-1" into ids when column num is - // zero. see ProtoSequenceDataProvider::getNextBatchInternal. - EXPECT_EQ(-1, args2[i].ids->get(totalLength)); - totalLength++; - } - } - EXPECT_EQ(totalLength, - args2[i].sequenceStartPositions->getElement(numSeqs)); - EXPECT_EQ(totalLength, - args2[i].subSequenceStartPositions->getElement(numSeqs)); - break; - } - case SlotDef::VECTOR_DENSE: { - // args1: for dense vector - checkArgContent(args2[i], numSeqs); - // check for args2: values are equal to args1[offset] - // (1) size - EXPECT_EQ(args2[i].value->getHeight(), (size_t)numSeqs); - EXPECT_EQ(args2[i].value->getWidth(), (size_t)getSlotDim(args1[i])); - // (2) content - for (int j = 0; j < numSeqs; j++) { - for (size_t k = 0; k < args2[i].value->getWidth(); k++) { - EXPECT_EQ( - static_cast(args1[i].value->getElement(j + offset, k)), - static_cast(args2[i].value->getElement(j, k))); - } - } - break; - } - default: { EXPECT_EQ(true, false) << "should not reach here"; } - } - } -} - -void testProtoSequenceDataProvider(int* numPerSlotType, - bool async, - bool useGpu) { - mkDir(kTestDir); - DataBatch data; - - prepareData(&data, - numPerSlotType, - /* iid */ true, - useGpu); - writeData(data, useGpu, /* dataCompression */ false); - - DataConfig config; - config.set_type("proto_sequence"); - config.set_files(kProtoFileList); - config.set_async_load_data(async); - - unique_ptr dataProvider(DataProvider::create(config, useGpu)); - dataProvider->setSkipShuffle(); - - EXPECT_EQ(data.getSize(), dataProvider->getSize()); - - int64_t batchSize = 10; - DataBatch batch; - - vector& args1 = data.getStreams(); - ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions; - - dataProvider->reset(); - - size_t args1Offset = 0; - while (dataProvider->getNextBatch(batchSize, &batch) > 0) { - CHECK_EQ(data.getNumStreams(), batch.getNumStreams()); - vector& args2 = batch.getStreams(); - ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions; - for (auto& arg : args1) { - // args1 should not has sequence - EXPECT_EQ(true, !arg.sequenceStartPositions); - } - for (auto& arg : args2) { - // args2 should has sequence - EXPECT_NE(true, !arg.sequenceStartPositions); - } - size_t numSeqs = batch.getNumSequences(); - checkSampleSequence(args1, args2, args1Offset, numSeqs, useGpu); - args1Offset += numSeqs; - } - - EXPECT_EQ(args1Offset, (size_t)data.getNumSequences()); - rmDir(kTestDir); -} - -TEST(ProtoSequenceDataProvider, test) { - int numSlotsArray[] = {0, 3}; - int numTwoArray[] = {0, 1}; - for (int numSparseNonValueVecSlots : numSlotsArray) { - for (int numIdSlots : numSlotsArray) { - for (int numDenseVecSlots : numSlotsArray) { - if (numDenseVecSlots + numSparseNonValueVecSlots + numIdSlots < 1) - continue; - for (int async : numTwoArray) { - for (int useGpu : numTwoArray) { - if (async && useGpu) { - // Currently in async mode, useGpu is not supported - continue; - } -#ifndef PADDLE_WITH_CUDA - if (useGpu) { - continue; - } -#endif - LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots - << " numSparseNonValueVecSlots=" - << numSparseNonValueVecSlots - << " numIdSlots=" << numIdSlots << " async=" << async - << " useGpu=" << useGpu; - int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0}; - numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots; - numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] = - numSparseNonValueVecSlots; - numPerSlotType[SlotDef::INDEX] = numIdSlots; - testProtoSequenceDataProvider(numPerSlotType, async, useGpu); - } // end for (int useGpu : numTwoArray) - } // end for (int async : numTwoArray) - } // end for (int numDenseVecSlots : numSlotsArray) - } // end for (int numIdSlots : numSlotsArray) - } // end for (int numSparseNonValueVecSlots : numSlotsArray) -} diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 4adaaef9838f0d178468af3af142031325bfc11d..a2ef731ecbcd18ca4bd0b2381de04650a2686c2d 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -17,9 +17,13 @@ limitations under the License. */ #include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" +#ifndef PADDLE_MOBILE_INFERENCE DEFINE_int32(pool_limit_size, 536870912, "maximum memory size managed by a memory pool, default is 512M"); +#else +DEFINE_int32(pool_limit_size, 0, "default is 0"); +#endif namespace paddle { diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index a719da2560291dbc7e98aadfae41d4692d8afcad..46c2833030c936119e98adcdd338245bbdaddce7 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -61,6 +61,18 @@ function(op_library TARGET) set(pybind_flag 1) endif() + if ("${TARGET}" STREQUAL "compare_op") + set(pybind_flag 1) + file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n") + endif() + + # conv_op contains several operators + if ("${TARGET}" STREQUAL "conv_op") + set(pybind_flag 1) + # It's enough to just adding one operator to pybind + file(APPEND ${pybind_file} "USE_OP(conv2d);\n") + endif() + # pool_op contains several operators if ("${TARGET}" STREQUAL "pool_op") set(pybind_flag 1) @@ -68,9 +80,11 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP(pool2d);\n") endif() - if ("${TARGET}" STREQUAL "compare_op") + # pool_cudnn_op contains several operators + if ("${TARGET}" STREQUAL "pool_cudnn_op") set(pybind_flag 1) - file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n") + # It's enough to just adding one operator to pybind + file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n") endif() # pool_with_index_op contains several operators @@ -80,25 +94,18 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n") endif() - # conv_op contains several operators - if ("${TARGET}" STREQUAL "conv_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(conv2d);\n") - endif() - # conv_transpose_op contains several operators if ("${TARGET}" STREQUAL "conv_transpose_op") set(pybind_flag 1) # It's enough to just adding one operator to pybind file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n") endif() - - # pool_cudnn_op contains several operators - if ("${TARGET}" STREQUAL "pool_cudnn_op") + + # conv_transpose_cudnn_op contains two operators + if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op") set(pybind_flag 1) # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n") + file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n") endif() # save_restore_op contains several operators diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h index 666043e824f885e9c0e79e319d0a38ba108c209a..233a81198e336d3190565fb18556f96979cec0ce 100644 --- a/paddle/operators/array_operator.h +++ b/paddle/operators/array_operator.h @@ -42,6 +42,7 @@ class ArrayOp : public framework::OperatorBase { } else { offset = static_cast(*i_tensor.data()); } + VLOG(10) << " Offset = " << offset; return offset; } }; diff --git a/paddle/operators/bilinear_tensor_product_op.h b/paddle/operators/bilinear_tensor_product_op.h index ffa4f43a327418498c1f110504127e7d2878409d..1113a4c6f357edb4f6b14b73c6eec9c6cca24ce5 100644 --- a/paddle/operators/bilinear_tensor_product_op.h +++ b/paddle/operators/bilinear_tensor_product_op.h @@ -174,7 +174,7 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { // Caculate the gradient of Input(Bias). if (d_bias) { d_bias->mutable_data(ctx.GetPlace()); - auto d_bias_mat = EigenMatrix::From(*d_bias); + auto d_bias_mat = framework::EigenVector::Flatten(*d_bias); d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes(0)); } } diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc index 2aec4a2760260623c4c7054c590afa8e1c6c3fea..4900f7b086c869b496c492743c71ab7047c5f672 100644 --- a/paddle/operators/conv_cudnn_op.cu.cc +++ b/paddle/operators/conv_cudnn_op.cu.cc @@ -226,9 +226,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel { T alpha = 1.0f, beta = 0.0f; if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*input_grad); - t.device(ctx.GetEigenDevice()) = - t.constant(static_cast(0)); + // Because beta is zero, it is unnecessary to reset input_grad. + for (int i = 0; i < groups; i++) { PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, cudnn_filter_desc, @@ -241,9 +240,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel { // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*filter_grad); - t.device(ctx.GetEigenDevice()) = - t.constant(static_cast(0)); + // Because beta is zero, it is unnecessary to reset filter_grad. + for (int i = 0; i < groups; i++) { PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index 687d741cb22a081eab18c61752200b9fd48f68a7..7a36a9b21aa6a1b415ac5a232e65eda8051c87f8 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -225,11 +225,15 @@ REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, ops::ConvOpGrad); REGISTER_OP_CPU_KERNEL(conv2d, - ops::GemmConvKernel); + ops::GemmConvKernel, + ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( - conv2d_grad, ops::GemmConvGradKernel); + conv2d_grad, ops::GemmConvGradKernel, + ops::GemmConvGradKernel); REGISTER_OP_CPU_KERNEL(conv3d, - ops::GemmConvKernel); + ops::GemmConvKernel, + ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( - conv3d_grad, ops::GemmConvGradKernel); + conv3d_grad, ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc index 8e6f9da455b7291049aee57189dae15b8bcc2150..546451234a1ed1a4d3119cb175c6d37ae3f0aac1 100644 --- a/paddle/operators/conv_op.cu.cc +++ b/paddle/operators/conv_op.cu.cc @@ -17,11 +17,15 @@ namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(conv2d, - ops::GemmConvKernel); + ops::GemmConvKernel, + ops::GemmConvKernel); REGISTER_OP_GPU_KERNEL( - conv2d_grad, ops::GemmConvGradKernel); + conv2d_grad, ops::GemmConvGradKernel, + ops::GemmConvGradKernel); REGISTER_OP_GPU_KERNEL(conv3d, - ops::GemmConvKernel); + ops::GemmConvKernel, + ops::GemmConvKernel); REGISTER_OP_GPU_KERNEL( - conv3d_grad, ops::GemmConvGradKernel); + conv3d_grad, ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv2d_transpose_cudnn_op.cc b/paddle/operators/conv_transpose_cudnn_op.cc similarity index 61% rename from paddle/operators/conv2d_transpose_cudnn_op.cc rename to paddle/operators/conv_transpose_cudnn_op.cc index fce1357ce5af5f11ccc5941690431393301e6725..dbd1bc3c3bc2d026f13ddcf62919db6cf7d87bc5 100644 --- a/paddle/operators/conv2d_transpose_cudnn_op.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cc @@ -23,7 +23,24 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker { framework::OpAttrChecker* op_checker) : Conv2DTransposeOpMaker(proto, op_checker) { AddAttr>("dilations", "dilations of convolution operator.") - .SetDefault(std::vector{1, 1}); + .SetDefault({1, 1}); + AddAttr("workspace_size_MB", + "workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardward. This size should be carefully setted.") + .SetDefault(4096); + } +}; + +class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker { + public: + CudnnConv3DTransposeOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : Conv3DTransposeOpMaker(proto, op_checker) { + AddAttr>("dilations", "dilations of convolution operator.") + .SetDefault({1, 1, 1}); AddAttr("workspace_size_MB", "workspace size for cudnn, in MB, " "workspace is a section of GPU memory which will be " @@ -48,3 +65,14 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( conv2d_transpose_cudnn_grad, ops::GemmConvTransposeGradKernel); + +REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp, + ops::CudnnConv3DTransposeOpMaker, conv3d_transpose_cudnn_grad, + ops::ConvTransposeOpGrad); + +REGISTER_OP_CPU_KERNEL( + conv3d_transpose_cudnn, + ops::GemmConvTransposeKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_transpose_cudnn_grad, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv2d_transpose_cudnn_op.cu.cc b/paddle/operators/conv_transpose_cudnn_op.cu.cc similarity index 92% rename from paddle/operators/conv2d_transpose_cudnn_op.cu.cc rename to paddle/operators/conv_transpose_cudnn_op.cu.cc index eff058afc6cc5dacf2a054a33f352824865c1924..e2ba77086e737a07471f14e483cbd32ab1d4ee12 100644 --- a/paddle/operators/conv2d_transpose_cudnn_op.cu.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc @@ -54,15 +54,21 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel { ScopedTensorDescriptor output_desc; ScopedFilterDescriptor filter_desc; ScopedConvolutionDescriptor conv_desc; - DataLayout layout = DataLayout::kNCHW; + DataLayout layout; + + if (strides.size() == 2U) { + layout = DataLayout::kNCHW; + } else { + layout = DataLayout::kNCDHW; + } - // N, M, H, W + // (N, M, H, W) or (N, M, D, H, W) cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, framework::vectorize2int(input->dims())); - // N, C, O_h, O_w + // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( layout, framework::vectorize2int(output->dims())); - // M, C, K_h, K_w + // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w) cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( layout, framework::vectorize2int(filter->dims())); cudnnConvolutionDescriptor_t cudnn_conv_desc = @@ -136,13 +142,13 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { ScopedConvolutionDescriptor conv_desc; DataLayout layout = DataLayout::kNCHW; - // Input: (N, M, H, W) + // Input: (N, M, H, W) or (N, M, D, H, W) cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, framework::vectorize2int(input->dims())); - // Output: (N, C, O_H, O_W) + // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( layout, framework::vectorize2int(output_grad->dims())); - // Filter (M, C, K_H, K_W) + // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w) cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( layout, framework::vectorize2int(filter->dims())); @@ -200,8 +206,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { T alpha = 1.0f, beta = 0.0f; if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - math::set_constant(ctx.device_context(), input_grad, 0); - + // Because beta is zero, it is unnecessary to reset input_grad. PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo, @@ -212,8 +217,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); - math::set_constant(ctx.device_context(), filter_grad, 0); - + // Because beta is zero, it is unnecessary to reset filter_grad. // Gradient with respect to the filter PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc, @@ -234,3 +238,8 @@ REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn, ops::CudnnConvTransposeOpKernel); REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad, ops::CudnnConvTransposeGradOpKernel); + +REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn, + ops::CudnnConvTransposeOpKernel); +REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn_grad, + ops::CudnnConvTransposeGradOpKernel); diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc index 13ac0cd54cbeb8f68c2246f7e1d02f032266a72e..3e55ef036a7fb976117054574d1347fa943acd55 100644 --- a/paddle/operators/conv_transpose_op.cc +++ b/paddle/operators/conv_transpose_op.cc @@ -30,11 +30,6 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); - for (size_t i = 0; i < paddings.size(); ++i) { - PADDLE_ENFORCE_EQ(paddings[i], 0, - "No Padding allowed in conv transpose op."); - } - PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, "ConvTransposeOp intput should be 4-D or 5-D tensor."); PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(), @@ -52,7 +47,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { std::vector output_shape({in_dims[0], filter_dims[1]}); for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back((in_dims[i + 2] - 1) * strides[i] + + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] + filter_dims[i + 2]); } ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); @@ -190,17 +185,21 @@ REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, REGISTER_OP_CPU_KERNEL( conv2d_transpose, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, conv3d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OP_CPU_KERNEL( conv3d_transpose, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_op.cu.cc b/paddle/operators/conv_transpose_op.cu.cc index 401cddb379ced134b800d2a078fe130a2850fbb2..4165eb0c7b048b83bbd94c57b971530043b66545 100644 --- a/paddle/operators/conv_transpose_op.cu.cc +++ b/paddle/operators/conv_transpose_op.cu.cc @@ -18,14 +18,18 @@ namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL( conv2d_transpose, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_GPU_KERNEL( conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); REGISTER_OP_GPU_KERNEL( conv3d_transpose, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_GPU_KERNEL( conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h index 4b2bd60437da8f58054d8cdd5e6ba1fdac05f0d5..ab336ad23ce1c180b68d04e4c85b299e301d5376 100644 --- a/paddle/operators/conv_transpose_op.h +++ b/paddle/operators/conv_transpose_op.h @@ -62,7 +62,6 @@ class GemmConvTransposeKernel : public framework::OpKernel { Tensor* output = context.Output("Output"); std::vector strides = context.Attr>("strides"); - // Actually, no paddings and groups allowed in conv transpose. std::vector paddings = context.Attr>("paddings"); // TODO(Zhuoyuan): Paddings can be added in future. // groups will alway be disabled in conv2dtranspose. @@ -148,8 +147,8 @@ class GemmConvTransposeKernel : public framework::OpKernel { } else if (filter_shape_vec.size() == 3) { // col2vol: col_matrix -> dy // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w) - col2vol(context.device_context(), col, dilations, strides, - std::vector{0, 0, 0}, &output_batch); + col2vol(context.device_context(), col, dilations, strides, paddings, + &output_batch); } } } @@ -173,7 +172,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { if ((!input_grad) && (!filter_grad)) return; std::vector strides = context.Attr>("strides"); - // Actually, no paddings and groups allowed in conv transpose. std::vector paddings = context.Attr>("paddings"); const int batch_size = static_cast(input->dims()[0]); diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 68c56f531f941e1b8f66ac7ba6bf318881642c4f..62a4e484eceeabc4cc26e68ac54a50be1ac95df7 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -132,7 +132,7 @@ class CosSimGradKernel : public framework::OpKernel { // compute dy if (out_grad_y) { out_grad_y->mutable_data(context.GetPlace()); - auto dy = EigenMatrix::Reshape(*out_grad_y, 1); + auto dy = EigenVector::Flatten(*out_grad_y); auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast; dy.device(place) = (dz_bcast * grad).sum(Eigen::array({{0}})); } diff --git a/paddle/operators/detail/safe_ref.h b/paddle/operators/detail/safe_ref.h new file mode 100644 index 0000000000000000000000000000000000000000..b71af17309f9f46b5c87f0f479d4e03443fa7f93 --- /dev/null +++ b/paddle/operators/detail/safe_ref.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +namespace paddle { +namespace operators { +namespace detail { +/** + * Get Reference From Pointer with check. The error message is printf format, + * and passed by `args` + */ +template +inline T &Ref(T *ptr, ARGS &&... args) { + PADDLE_ENFORCE(ptr != nullptr, args...); + return *ptr; +} +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc index 85871ebbfcd8ee38ef5e8078d1d6cb6bdda46a7b..985b5d1e865e513d833bff72dcd20a8f20851d8c 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cc @@ -101,4 +101,7 @@ REGISTER_OPERATOR(fill_constant_batch_size_like, REGISTER_OP_CPU_KERNEL( fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel); + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/operators/fill_constant_batch_size_like_op.cu.cc index 87e3697e2832e7c60a4293fe7126ae4c9c053e4d..9e7a1eeab863c962ca72908e561e12a04d5021c5 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cu.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cu.cc @@ -19,4 +19,7 @@ namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL( fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel); + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc index 8ab39d4fb012b8fa3883f33e4d15be7918500354..95fb5932b8b555e1357adc9fdfb7b6e6db7da71d 100644 --- a/paddle/operators/fill_zeros_like_op.cc +++ b/paddle/operators/fill_zeros_like_op.cc @@ -54,5 +54,8 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, ops::FillZerosLikeOpMaker); REGISTER_OP_CPU_KERNEL( - fill_zeros_like, - ops::FillZerosLikeKernel); + fill_zeros_like, ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.cu.cc b/paddle/operators/fill_zeros_like_op.cu.cc index 2adb40cf90b42a5ba608302f7985346c949ff6ed..1501a17441072223ba0e8cf5b6c8cdd5e903a467 100644 --- a/paddle/operators/fill_zeros_like_op.cu.cc +++ b/paddle/operators/fill_zeros_like_op.cu.cc @@ -17,5 +17,8 @@ namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL( - fill_zeros_like, - ops::FillZerosLikeKernel); + fill_zeros_like, ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h index 55e9cc4a98bd6d36ce5d6bb4116039d0ec18b485..1b18368e0e16365682520b62a7f6adab0cbb527f 100644 --- a/paddle/operators/gru_op.h +++ b/paddle/operators/gru_op.h @@ -24,8 +24,17 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +inline void ReorderInitState(const platform::DeviceContext& ctx, + const framework::Tensor& src, const size_t* index, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + row_shuffle(ctx, src, index, *dst, indexed_src); +} template class GRUKernel : public framework::OpKernel { @@ -33,7 +42,6 @@ class GRUKernel : public framework::OpKernel { void BatchCompute(const framework::ExecutionContext& context) const { auto* input = context.Input("Input"); auto* h0 = context.Input("H0"); - const T* h0_data = h0 ? h0->data() : nullptr; auto* weight = context.Input("Weight"); const T* weight_data = weight->data(); auto* bias = context.Input("Bias"); @@ -66,7 +74,18 @@ class GRUKernel : public framework::OpKernel { gru_value.gateWeight = const_cast(weight_data); gru_value.stateWeight = const_cast(weight_data + 2 * frame_size * frame_size); - gru_value.prevOutValue = const_cast(h0_data); + Tensor ordered_h0; + const size_t* order = batch_gate->lod()[2].data(); + if (h0) { + // Since the batch computing for GRU reorders the input sequences + // according to their length. The initialized cell state also needs + // to reorder. + ReorderInitState(context.device_context(), *h0, order, + &ordered_h0, true); + gru_value.prevOutValue = ordered_h0.data(); + } else { + gru_value.prevOutValue = nullptr; + } auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; for (size_t n = 0; n < num_batch; n++) { @@ -102,7 +121,6 @@ class GRUGradKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { auto* h0 = context.Input("H0"); - const T* h0_data = h0 ? h0->data() : nullptr; auto* weight = context.Input("Weight"); const T* weight_data = weight->data(); auto* batch_gate = context.Input("BatchGate"); @@ -135,6 +153,17 @@ class GRUGradKernel : public framework::OpKernel { zero(dev_ctx, &batch_gate_grad, static_cast(0.0)); zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast(0.0)); + Tensor ordered_h0, ordered_h0_grad; + const size_t* order = batch_gate->lod()[2].data(); + if (h0) { + ReorderInitState(context.device_context(), *h0, order, + &ordered_h0, true); + } + if (h0_grad) { + ordered_h0_grad.mutable_data(h0_grad->dims(), context.GetPlace()); + zero(context.device_context(), &ordered_h0_grad, static_cast(0.0)); + } + bool is_reverse = context.Attr("is_reverse"); batch_hidden_grad.set_lod(batch_hidden->lod()); to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse); @@ -176,14 +205,9 @@ class GRUGradKernel : public framework::OpKernel { batch_reset_hidden_prev_grad.Slice(bstart, bend); gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data(); if (n == 0) { - gru_value.prevOutValue = const_cast(h0_data); - if (h0_grad) { - T* h0_grad_data = h0_grad->mutable_data(context.GetPlace()); - zero(dev_ctx, h0_grad, static_cast(0.0)); - gru_grad.prevOutGrad = h0_grad_data; - } else { - gru_grad.prevOutGrad = nullptr; - } + gru_value.prevOutValue = h0 ? ordered_h0.data() : nullptr; + gru_grad.prevOutGrad = + h0 && h0_grad ? ordered_h0_grad.data() : nullptr; } else { int bstart_pre = static_cast(batch_starts[n - 1]); Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart); @@ -208,6 +232,10 @@ class GRUGradKernel : public framework::OpKernel { math::ColwiseSum col_sum; col_sum(dev_ctx, batch_gate_grad, bias_grad); } + if (h0 && h0_grad) { + ReorderInitState(context.device_context(), ordered_h0_grad, + order, h0_grad, false); + } } void Compute(const framework::ExecutionContext& context) const override { diff --git a/paddle/operators/is_empty_op.cc b/paddle/operators/is_empty_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..54fecf44e881b5c283c81580fd161da9808d253e --- /dev/null +++ b/paddle/operators/is_empty_op.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" + +namespace paddle { +namespace operators { + +constexpr char kInput[] = "X"; +constexpr char kOutput[] = "Out"; + +class IsEmptyOp : public framework::OperatorBase { + public: + IsEmptyOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::DeviceContext &dev_ctx) const override { + // get input + auto *var = scope.FindVar(Input(kInput)); + PADDLE_ENFORCE_NOT_NULL(var); + auto &tensor = var->Get(); + // get output + auto *out = scope.FindVar(Output(kOutput)); + PADDLE_ENFORCE_NOT_NULL(out); + auto *out_tensor = out->GetMutable(); + + out_tensor->Resize({1}); + out_tensor->mutable_data(platform::CPUPlace())[0] = + framework::product(tensor.dims()) == 0; + } +}; + +class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + IsEmptyOpProtoMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kInput, "(Tensor) Tensor which is to be checked."); + AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not."); + AddComment(R"DOC( +IsEmpty Operator which checks whether a tensor is empty. + +It will just return product(tensor.ddims()) > 0; + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(is_empty, paddle::operators::IsEmptyOp, + paddle::operators::IsEmptyOpProtoMaker); diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index b9417f1d7fdc663fff751328d18239af3dbb1216..002b68fecf4f1e294387357f0346d9926a2b2b5a 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -1,7 +1,7 @@ add_subdirectory(detail) if(WITH_GPU) - nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context) + nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor) @@ -15,7 +15,7 @@ if(WITH_GPU) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function) else() - cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context) + cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) cc_library(softmax SRCS softmax.cc DEPS device_context) cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context) diff --git a/paddle/operators/math/im2col.cu b/paddle/operators/math/im2col.cu index 347df7a0ffdec163c0479a71ec775a813930ba5f..bf7894243919571c2ab15d53690b1ef05bfcc6ee 100644 --- a/paddle/operators/math/im2col.cu +++ b/paddle/operators/math/im2col.cu @@ -119,8 +119,8 @@ __global__ void col2im(int n, const T* data_col, int im_height, int im_width, if (index < n) { T val = 0; - int w = index % im_width; - int h = (index / im_width) % im_height; + int w = index % im_width + padding_width; + int h = (index / im_width) % im_height + padding_height; int c = index / (im_width * im_height); // compute the start and end of the output diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index 5ee091788687133f6eaef7229d9f95e2025a2daf..2e333a8cde721f8e65dbf2cf5e3aac6272172cc0 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -250,6 +250,8 @@ void axpy(const platform::DeviceContext& context, template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant; +template struct SetConstant; #define DEFINE_CPU_TRANS(RANK) \ template struct Transpose; \ diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 38c04b97f9d07b9cca938b09f46ea81328a35322..58356a4b7783241ca0292829bf05dc1a8ed80c6c 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -256,6 +256,8 @@ void axpy(const platform::DeviceContext& context, template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant; +template struct SetConstant; #define DEFINE_GPU_TRANS(RANK) \ template struct Transpose; \ diff --git a/paddle/operators/pool_cudnn_op.cu.cc b/paddle/operators/pool_cudnn_op.cu.cc index 8711567b95fea355396173b5312d26d31f9ffb12..f9d8af3e1c5db49873979fdfeb17a32d16341a1a 100644 --- a/paddle/operators/pool_cudnn_op.cu.cc +++ b/paddle/operators/pool_cudnn_op.cu.cc @@ -135,8 +135,7 @@ class PoolCudnnGradOpKernel : public framework::OpKernel { if (input_grad) { T *input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; - set_zero(ctx.device_context(), input_grad, static_cast(0)); + // Because beta is zero, it is unnecessary to reset input_grad. PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward( handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc index 990053b8af9f6bfb982eaf994374daa45ce6cbfb..cbe0b4233160dd1f3ebdf6db8b5f6df392efdfe7 100755 --- a/paddle/operators/sequence_slice_op.cc +++ b/paddle/operators/sequence_slice_op.cc @@ -42,7 +42,8 @@ class SequenceSliceOp : public framework::OperatorWithKernel { length_dim.size(), 2UL, "Only support one level sequence now, The rank of Length must be 2."); - // Initialize the output's dims to maximum + // Initialize the output's dims to maximum, + // and re-set to real dims by the value of Offset and Length at kernel ctx->SetOutputDim("Out", input_dims); } diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h index c7d7ef4916756397c3d5911d7ec25305efc815c0..2ef2c8f0c4f3061c0082916edc18564698821bfc 100755 --- a/paddle/operators/sequence_slice_op.h +++ b/paddle/operators/sequence_slice_op.h @@ -143,6 +143,7 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { if (x_grad) { x_grad->mutable_data(ctx.GetPlace()); + x_grad->set_lod(in->lod()); math::SetConstant set_zero; set_zero(ctx.device_context(), x_grad, static_cast(0)); diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index 9837f325e30f68ba927a540d395cc7d7e093a607..c2b7632b2865a3ef66051d815d7722a08c6a8cbd 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include "paddle/operators/sum_op.h" #include #include "paddle/framework/var_type_inference.h" +#include "paddle/operators/detail/safe_ref.h" namespace paddle { namespace operators { @@ -59,13 +60,16 @@ class SumOp : public framework::OperatorWithKernel { x_vars[0]->Get().value().type()), ctx.device_context()); } else if (x_vars[0]->IsType()) { - auto& array = x_vars[0]->Get(); - for (auto& each : array) { - if (each.numel() != 0) { - return framework::OpKernelType(framework::ToDataType(each.type()), - ctx.device_context()); + for (auto& x_var : x_vars) { + auto& array = x_var->Get(); + for (auto& each : array) { + if (each.numel() != 0) { + return framework::OpKernelType(framework::ToDataType(each.type()), + ctx.device_context()); + } } } + PADDLE_THROW("Cannot find the input data type by all input data"); } PADDLE_THROW("Unexpected branch. Input type is %s", x_vars[0]->Type().name()); @@ -96,6 +100,11 @@ class SumOpVarTypeInference : public framework::VarTypeInference { auto& inputs = op_desc.Input("X"); auto var_type = framework::VarDesc::SELECTED_ROWS; + for (auto& name : op_desc.Input("X")) { + VLOG(10) << name << " " + << block->FindRecursiveOrCreateVar(name)->GetType(); + } + bool any_input_is_lod_tensor = std::any_of( inputs.begin(), inputs.end(), [block](const std::string& name) { return block->FindRecursiveOrCreateVar(name)->GetType() == @@ -103,7 +112,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference { }); auto is_tensor_array = [block](const std::string& name) { - return block->FindRecursiveOrCreateVar(name)->GetType() == + return detail::Ref(block->FindRecursiveOrCreateVar(name)).GetType() == framework::VarDesc::LOD_TENSOR_ARRAY; }; @@ -113,14 +122,26 @@ class SumOpVarTypeInference : public framework::VarTypeInference { std::all_of(inputs.begin(), inputs.end(), is_tensor_array); if (any_input_is_tensor_array) { - PADDLE_ENFORCE(all_inputs_are_tensor_array); + if (!all_inputs_are_tensor_array) { + std::ostringstream os; + for (auto& each : inputs) { + os << " " << each << " type is " + << detail::Ref(block->FindRecursiveOrCreateVar(each)).GetType() + << "\n"; + } + PADDLE_ENFORCE(all_inputs_are_tensor_array, + "Not all inputs are tensor array:\n%s", os.str()); + } var_type = framework::VarDesc::LOD_TENSOR_ARRAY; } else if (any_input_is_lod_tensor) { var_type = framework::VarDesc::LOD_TENSOR; } auto out_var_name = op_desc.Output("Out").front(); - block->FindRecursiveOrCreateVar(out_var_name)->SetType(var_type); + auto& out_var = detail::Ref(block->FindRecursiveOrCreateVar(out_var_name)); + out_var.SetType(var_type); + auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front())); + out_var.SetDataType(in_var.GetDataType()); } }; diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index 62e15604c47f25c458abc69ecd1cabf964de39bb..ae1b48d7a8e3d573a5134a822a2ed5ef70511077 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/array_operator.h" - +#include "paddle/operators/detail/safe_ref.h" namespace paddle { namespace operators { @@ -33,6 +33,8 @@ class WriteToArrayOp : public ArrayOp { auto *out = scope.FindVar(Output("Out"))->GetMutable(); if (offset >= out->size()) { + VLOG(10) << "Resize " << Output("Out") << " from " << out->size() + << " to " << offset + 1; out->resize(offset + 1); } auto *out_tensor = &out->at(offset); @@ -85,11 +87,15 @@ class WriteToArrayInferVarType : public framework::VarTypeInference { public: void operator()(const framework::OpDescBind &op_desc, framework::BlockDescBind *block) const override { - for (auto &out_var : op_desc.OutputArgumentNames()) { - VLOG(10) << "Set Variable " << out_var << " as LOD_TENSOR_ARRAY"; - block->FindRecursiveOrCreateVar(out_var)->SetType( - framework::VarDesc::LOD_TENSOR_ARRAY); - } + auto x_name = op_desc.Input("X")[0]; + auto out_name = op_desc.Output("Out")[0]; + VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; + auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name), + "Cannot found %s", out_name); + out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY); + auto &x = + detail::Ref(block->FindVarRecursive(x_name), "Cannot found %s", x_name); + out.SetDataType(x.GetDataType()); } }; @@ -107,11 +113,11 @@ class ReadFromArrayOp : public ArrayOp { auto &x_array = x->Get(); auto *out = scope.FindVar(Output("Out")); PADDLE_ENFORCE(out != nullptr, "Out must be set"); - auto *out_tesnor = out->GetMutable(); + auto *out_tensor = out->GetMutable(); size_t offset = GetOffset(scope, dev_ctx); PADDLE_ENFORCE_LT(offset, x_array.size()); - out_tesnor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx); - out_tesnor->set_lod(x_array[offset].lod()); + out_tensor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx); + out_tensor->set_lod(x_array[offset].lod()); } }; diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 4ca6c8507a48507fd29a9c9acae2bdf36ed936ee..dcc59f5ff2ae3a8ca999d72a20cfd5c759987d89 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -14,8 +14,10 @@ #include #include "paddle/framework/executor.h" +#include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" +#include "paddle/operators/detail/safe_ref.h" namespace paddle { namespace operators { @@ -26,8 +28,9 @@ using LoDTensor = framework::LoDTensor; constexpr char kStepBlock[] = "step_block"; constexpr char kCondition[] = "Condition"; constexpr char kStepScopes[] = "StepScopes"; -constexpr char kParamGrads[] = "X@Grad"; constexpr char kParameters[] = "X"; +constexpr char kParamGrads[] = "X@GRAD"; +constexpr char kOutputs[] = "Out"; class WhileOp : public framework::OperatorBase { public: @@ -71,9 +74,9 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { kCondition, "(Bool) An scalar. When it's False, the While Op will be terminated.") .AsDuplicable(); - AddOutput("Out", + AddOutput(kOutputs, "A set of variables, which will be assigned with values " - "generated by perators inside the block of While Op.") + "generated by the operators inside the block of While Op.") .AsDuplicable(); AddOutput(kStepScopes, "(StepScopeVar) A vector of local scope, which size equals the " @@ -104,17 +107,64 @@ class WhileGradOp : public framework::OperatorBase { auto *step_scopes = scope.FindVar(Input(kStepScopes))->GetMutable(); + auto outside_og_names = Inputs(framework::GradVarName(kOutputs)); + auto inside_og_names = + Attr>("original_output_grad"); + + PADDLE_ENFORCE_EQ(outside_og_names.size(), inside_og_names.size()); + for (auto cur_scope_iter = step_scopes->rbegin(); cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) { + VLOG(3) << "Start backward at time_step " + << cur_scope_iter - step_scopes->rbegin(); + framework::Scope &cur_scope = **cur_scope_iter; + // Link OG from outside to inside + for (size_t i = 0; i < outside_og_names.size(); ++i) { + auto outside_og_name = outside_og_names[i]; + auto inside_og_name = inside_og_names[i]; + VLOG(10) << "Linking outside " << outside_og_name << " --> inside " + << inside_og_name; + auto &og_outside = detail::Ref(scope.FindVar(outside_og_name)); + auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name)); + if (og_outside.Type().hash_code() == + typeid(framework::LoDTensor).hash_code()) { + auto &outside_tensor = og_outside.Get(); + auto &inside_tensor = + detail::Ref(og_inside.GetMutable()); + inside_tensor.set_lod(outside_tensor.lod()); + inside_tensor.ShareDataWith(outside_tensor); + } else if (og_outside.Type().hash_code() == + typeid(framework::LoDTensorArray).hash_code()) { + auto &outside_array = og_outside.Get(); + auto &inside_array = + detail::Ref(og_inside.GetMutable()); + VLOG(10) << outside_og_name << " size = " << outside_array.size(); + inside_array.resize(outside_array.size()); + + for (size_t j = 0; j < inside_array.size(); ++j) { + VLOG(10) << j << " " << outside_array[j].numel(); + if (outside_array[j].numel() != 0) { + inside_array[j].set_lod(outside_array[j].lod()); + inside_array[j].ShareDataWith(outside_array[j]); + } else { + PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0); + } + } + } + } + executor.Run(*program, *cur_scope_iter, block->ID(), false); auto &pg_names = Outputs(kParamGrads); auto &p_names = Inputs(kParameters); PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); - for (size_t prog_id = 0; prog_id < pg_names.size(); ++prog_id) { - auto inside_grad_name = framework::GradVarName(p_names[prog_id]); + for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) { + if (pg_names[param_id] == framework::kEmptyVarName) { + continue; // iterator doesn't have gradient + } + auto inside_grad_name = framework::GradVarName(p_names[param_id]); - // // TODO(tonyyang-savil: Not sure we need the following + // // TODO(tonyyang-svail): Not sure we need the following // // If does not compute gradient of that variable inside rnn, // just // // continue @@ -126,7 +176,7 @@ class WhileGradOp : public framework::OperatorBase { // zero gradient variable in step 0 if (cur_scope_iter == step_scopes->rbegin()) { auto *var = (*cur_scope_iter)->FindVar(inside_grad_name); - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name); if (var->IsType()) { auto &inside_tensor = var->Get(); framework::AttributeMap attrs; @@ -135,27 +185,18 @@ class WhileGradOp : public framework::OperatorBase { attrs["value"] = 0.0f; auto zero_op = framework::OpRegistry::CreateOp( - "fill_constant", {}, {{"Out", {pg_names[prog_id]}}}, attrs); + "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs); zero_op->Run(scope, dev_ctx); } } // sum gradient - auto *outside_var = scope.FindVar(pg_names[prog_id]); - PADDLE_ENFORCE_NOT_NULL(outside_var); - auto &outside_tensor = *outside_var->GetMutable(); - - std::string result_var_name; - auto *local_result_var = (*cur_scope_iter)->Var(&result_var_name); - auto &local_result_tensor = - *local_result_var->GetMutable(); - - local_result_tensor.ShareDataWith(outside_tensor); - + auto new_inside_name = cur_scope.Rename(inside_grad_name); auto sum_op = framework::OpRegistry::CreateOp( - "sum", {{"X", {result_var_name, inside_grad_name}}}, - {{"Out", {result_var_name}}}, {}); - sum_op->Run(**cur_scope_iter, dev_ctx); + "sum", {{"X", {pg_names[param_id], new_inside_name}}}, + {{"Out", {pg_names[param_id]}}}, {}); + sum_op->Run(cur_scope, dev_ctx); + cur_scope.Rename(new_inside_name, inside_grad_name); } } } @@ -169,29 +210,110 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { virtual std::unique_ptr Apply() const { auto *grad = new framework::OpDescBind(); grad->SetType("while_grad"); - for (auto &input_param : this->InputNames()) { - grad->SetInput(input_param, this->Input(input_param)); - grad->SetOutput(framework::GradVarName(input_param), - this->InputGrad(input_param)); + grad->SetInput(kParameters, Input(kParameters)); + grad->SetOutput( + framework::GradVarName(kParameters), + InputGrad(kParameters, /*do not drop empty gradient*/ false)); + grad->SetInput(kOutputs, Output(kOutputs)); + + // OG should be re-calculated by step blocks, since many outputs of while op + // do not need to calculate gradients. + std::unordered_set block_ins; + { + for (auto &p : Input(kParameters)) { + block_ins.insert(p); + } + for (auto &o : Output(kOutputs)) { + block_ins.insert(o); + } } + std::unordered_set extra_inputs; + for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) { + for (auto &input_name : grad_block_[0]->Op(i)->InputArgumentNames()) { + if (block_ins.find(input_name) != block_ins.end()) { + continue; + } + extra_inputs.insert(input_name); + } - for (auto &output_param : this->OutputNames()) { - grad->SetInput(output_param, this->Output(output_param)); - if (output_param != kStepScopes) { - grad->SetInput(framework::GradVarName(output_param), - this->OutputGrad(output_param)); + for (auto &output_name : grad_block_[0]->Op(i)->OutputArgumentNames()) { + block_ins.insert(output_name); } } + + std::vector extra_inputs_list; + extra_inputs_list.resize(extra_inputs.size()); + std::copy(extra_inputs.begin(), extra_inputs.end(), + extra_inputs_list.begin()); + grad->SetInput(framework::GradVarName(kOutputs), extra_inputs_list); + grad->SetInput(kStepScopes, Output(kStepScopes)); grad->SetAttrMap(this->Attrs()); grad->SetBlockAttr(kStepBlock, *grad_block_[0]); + // record the original output gradient names, since the gradient name of + // while operator could be renamed. + grad->SetAttr("original_output_grad", extra_inputs_list); return std::unique_ptr(grad); } }; +class WhileGradOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDescBind &op_desc, + framework::BlockDescBind *block) const override { + auto p_names = op_desc.Input(kParameters); + auto pg_names = op_desc.Output(framework::GradVarName(kParameters)); + + for (size_t i = 0; i < p_names.size(); ++i) { + auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i])); + auto *g_var = block->FindVarRecursive(pg_names[i]); + if (g_var != nullptr) { // Gradient could be @EMPTY@ + VLOG(5) << "Setting " << pg_names[i] << " following " << p_names[i] + << " type: " << p_var.GetType(); + g_var->SetType(p_var.GetType()); + g_var->SetDataType(p_var.GetDataType()); + } + } + } +}; + +class WhileGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + ctx->HasInputs(kParameters); + ctx->HasOutputs(framework::GradVarName(kParameters)); + ctx->HasInputs(kOutputs); + ctx->HasInputs(framework::GradVarName(kOutputs)); + + auto p_names = ctx->Inputs(kParameters); + auto pg_names = ctx->Outputs(kParamGrads); + auto dims = ctx->GetInputsDim(kParameters); + auto var_types = ctx->GetInputsVarType(kParameters); + std::vector names_to_set; + std::vector dims_to_set; + for (size_t i = 0; i < p_names.size(); ++i) { + if (pg_names[i] == framework::kEmptyVarName) { + continue; + } + if (var_types[i] == framework::VarDesc::LOD_TENSOR) { + names_to_set.push_back(pg_names[i]); + dims_to_set.push_back(dims[i]); + } else if (var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) { + // not sure how to set the dim of LOD_TENSOR_ARRAY + names_to_set.push_back(pg_names[i]); + dims_to_set.push_back(dims[i]); + } + } + ctx->SetDims(names_to_set, dims_to_set); + } +}; + } // namespace operators } // namespace paddle REGISTER_OPERATOR(while, paddle::operators::WhileOp, paddle::operators::WhileOpMaker, paddle::operators::WhileGradOpDescMaker); +REGISTER_OPERATOR(while_grad, paddle::operators::WhileGradOp, + paddle::operators::WhileGradOpShapeInference, + paddle::operators::WhileGradOpVarTypeInference); diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp index 8b3be062b654a52e667626199be8c8bb4a2a96d7..1898598e49652a2829e57329bab6017304cec662 100644 --- a/paddle/parameter/ParameterUpdateFunctions.cpp +++ b/paddle/parameter/ParameterUpdateFunctions.cpp @@ -30,7 +30,7 @@ void sgdUpdateCpu(real learningRate, const real* grad, real* momentumVec) { decayRate *= learningRate; -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_USE_MKLML #pragma omp parallel for #endif for (size_t i = 0; i < size; ++i) { diff --git a/paddle/platform/cudnn_helper.h b/paddle/platform/cudnn_helper.h index ce3421a3cb840e4c1e872eea12dedc1150c85962..dd48605b9ed688e4656d4cd1ddf1f298d0a50a9e 100644 --- a/paddle/platform/cudnn_helper.h +++ b/paddle/platform/cudnn_helper.h @@ -63,9 +63,10 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { } \ } while (false) -enum class DataLayout { +enum class DataLayout { // Not use kNHWC, kNCHW, + kNCDHW, kNCHW_VECT_C, }; @@ -107,12 +108,15 @@ class CudnnDataType { } }; -inline cudnnTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) { +inline cudnnTensorFormat_t GetCudnnTensorFormat( + const DataLayout& order) { // Not use switch (order) { case DataLayout::kNHWC: return CUDNN_TENSOR_NHWC; case DataLayout::kNCHW: return CUDNN_TENSOR_NCHW; + case DataLayout::kNCDHW: + return CUDNN_TENSOR_NCHW; // TODO(chengduoZH) : add CUDNN_TENSOR_NCDHW default: PADDLE_THROW("Unknown cudnn equivalent for order"); } @@ -139,7 +143,7 @@ class ScopedTensorDescriptor { strides[i] = dims[i + 1] * strides[i + 1]; } // Update tensor descriptor dims setting if groups > 1 - // FIXME(typhoonzero): Assume using NCHW order + // FIXME(typhoonzero): Assume using NCHW or NCDHW order std::vector dims_with_group(dims.begin(), dims.end()); // copy if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; @@ -176,9 +180,10 @@ class ScopedFilterDescriptor { const cudnnDataType_t type, const std::vector& kernel, const int groups = 1) { - // filter layout: MCHW, where M is the number of + // filter layout: MCHW(MCDHW), where M is the number of // output image channels, C is the number of input image channels, - // H and W is height and width of filter. + // D is the depth of the filter, H is the height of the filter, and W is the + // width of the filter. std::vector kernel_with_group(kernel.begin(), kernel.end()); if (groups > 1) { // M /= groups diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md index b5fd68839ddb62e76f2fd930248d546bc093a892..f3a6f1dba7588c6b29c1dcae26ec134c1a7f937d 100644 --- a/paddle/scripts/docker/README.md +++ b/paddle/scripts/docker/README.md @@ -57,8 +57,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" | `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. | | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. | | `WITH_TESTING` | ON | Build unit tests binaries. | -| `WITH_MKLDNN` | ON | Build with [Intel® MKL DNN](https://github.com/01org/mkl-dnn) support. | -| `WITH_MKLML` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) support. | +| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. | | `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. | | `WITH_SWIG_PY` | ON | Build with SWIG python API support. | | `WITH_C_API` | OFF | Build capi libraries for inference. | diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index e9c89eee1af1fcc4a7f168af5ec8b16912616687..595d25fd4830b6e69b9a1080803771b0464741db 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -34,8 +34,7 @@ function cmake_gen() { ${PYTHON_FLAGS} -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU:-OFF} - -DWITH_MKLDNN=${WITH_MKLDNN:-ON} - -DWITH_MKLML=${WITH_MKLML:-ON} + -DWITH_MKL=${WITH_MKL:-ON} -DWITH_AVX=${WITH_AVX:-OFF} -DWITH_GOLANG=${WITH_GOLANG:-ON} -DWITH_SWIG_PY=ON @@ -56,8 +55,7 @@ EOF ${PYTHON_FLAGS} \ -DWITH_DOC=OFF \ -DWITH_GPU=${WITH_GPU:-OFF} \ - -DWITH_MKLDNN=${WITH_MKLDNN:-ON} \ - -DWITH_MKLML=${WITH_MKLML:-ON} \ + -DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_GOLANG=${WITH_GOLANG:-ON} \ -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \ diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index b9a49526a7e02131767a4e9b26cd0b53278176d0..d71cb84df3785008ea5793519fc26a174e1b95f7 100755 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -18,8 +18,8 @@ function version(){ echo "PaddlePaddle @PADDLE_VERSION@, compiled with" echo " with_avx: @WITH_AVX@" echo " with_gpu: @WITH_GPU@" + echo " with_mkl: @WITH_MKL@" echo " with_mkldnn: @WITH_MKLDNN@" - echo " with_mklml: @WITH_MKLML@" echo " with_double: @WITH_DOUBLE@" echo " with_python: @WITH_PYTHON@" echo " with_rdma: @WITH_RDMA@" @@ -45,8 +45,8 @@ function ver2num() { function cpu_config() { # auto set KMP_AFFINITY and OMP_DYNAMIC from Hyper Threading Status - # only when MKLDNN or MKLML enabled - if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF"]; then + # only when MKL enabled + if [ "@WITH_MKL@" == "OFF" ]; then return 0 fi ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` @@ -70,8 +70,8 @@ function cpu_config() { function threads_config() { # auto set OMP_NUM_THREADS and MKL_NUM_THREADS # according to trainer_count and total processors - # only when MKLDNN or MKLML enabled - if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF"]; then + # only when MKL enabled + if [ "@WITH_MKL@" == "OFF" ]; then return 0 fi processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index 973b2736e5ce2b733d52df4f5a270b296bca2cac..28d82343ed32273740d0c52d0451681e43b3675e 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -6,7 +6,7 @@ mkdir -p $TRAVIS_BUILD_DIR/build cd $TRAVIS_BUILD_DIR/build # Compile Documentation only. -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON +cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON make -j `nproc` gen_proto_py make -j `nproc` paddle_docs paddle_docs_cn diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp index b68e29cd5ea223272151e7a8b52d998832f47103..88e684849df6fbfe4042b92bdb76ef98159eecea 100644 --- a/paddle/trainer/Trainer.cpp +++ b/paddle/trainer/Trainer.cpp @@ -137,6 +137,10 @@ void Trainer::init(const std::shared_ptr& config, } } + if (FLAGS_use_mkldnn) { + CHECK_EQ(FLAGS_trainer_count, 1UL) << "MKLDNN only need 1 trainer"; + } + if (testing) { LOG(INFO) << "trainer: in testing mode"; if (config_->getOptConfig().use_sparse_remote_updater() || diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt index f01ad4142d4fe7c7f7d7aac60d967ea114b93e56..80665551ec51214d90b866f0c7b2abb2fdee5f39 100644 --- a/paddle/trainer/tests/CMakeLists.txt +++ b/paddle/trainer/tests/CMakeLists.txt @@ -28,35 +28,7 @@ if(WITH_PYTHON) ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) endif() -################ test_CompareTwoNets ###################### -add_unittest_without_exec(test_CompareTwoNets - test_CompareTwoNets.cpp) -add_test(NAME test_CompareTwoNets - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets - --config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) -############### test_CompareTwoOpts ################### -add_unittest_without_exec(test_CompareTwoOpts - test_CompareTwoOpts.cpp) -add_test(NAME test_CompareTwoOpts - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoOpts - --config_file_a=trainer/tests/sample_trainer_config_opt_a.conf --config_file_b=trainer/tests/sample_trainer_config_opt_b.conf - --num_passes=1 --need_high_accuracy=0 - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) - -################# test_CompareSparse ################## -add_unittest_without_exec(test_CompareSparse - test_CompareSparse.cpp) -if(NOT ON_TRAVIS) - add_test(NAME test_CompareSparse - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ - ./.set_port.sh -p port -n 6 - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) -endif() ################# test_recurrent_machine_generation ############### add_unittest_without_exec(test_recurrent_machine_generation test_recurrent_machine_generation.cpp) diff --git a/paddle/trainer/tests/mnist.list b/paddle/trainer/tests/mnist.list deleted file mode 100644 index 703e87753d5a4f507aad11a6d875cea44787667b..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/mnist.list +++ /dev/null @@ -1 +0,0 @@ -trainer/tests/mnist_bin_part diff --git a/paddle/trainer/tests/mnist_bin_part b/paddle/trainer/tests/mnist_bin_part deleted file mode 100644 index 08b93a0ebb5698bdafbc36c3c757918a50bab621..0000000000000000000000000000000000000000 Binary files a/paddle/trainer/tests/mnist_bin_part and /dev/null differ diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data deleted file mode 100644 index f189b21e86a50d70d317b5e43aa2d6e05af5e774..0000000000000000000000000000000000000000 Binary files a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data and /dev/null differ diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist deleted file mode 100644 index 6b406dff0ba91b5f310d7eafa111c0d21d6542c3..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist +++ /dev/null @@ -1 +0,0 @@ -./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data diff --git a/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf b/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf deleted file mode 100644 index 92f32a18c0068ab4672034a270aa8c52f2716d59..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf +++ /dev/null @@ -1,154 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. - -# Note: when making change to this file, please make sure -# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest -# for comparing these two nets can pass (test_CompareTwoNets) - -default_initial_std(0.1) -default_device(0) - -word_dim = 999 -l1 = 0 -l2 = 0 - -model_type("nn") - -sparse_update = get_config_arg("sparse_update", bool, False) - -TrainData(ProtoData( - type = "proto_sequence", - files = ('trainer/tests/train_sparse.list'), - )) - -Settings( - algorithm='sgd', - batch_size=100, - learning_rate=0.0001, - learning_rate_decay_a=4e-08, - learning_rate_decay_b=0.0, - learning_rate_schedule='poly', -) - - -wordvec_dim = 32 -layer2_dim = 16 -layer3_dim = 16 -hidden_dim = 32 - -slot_names = ["qb", "qw", "tb", "tw"] - -def ltr_network(network_name, - word_dim=word_dim, - wordvec_dim=wordvec_dim, - layer2_dim=layer2_dim, - layer3_dim=layer3_dim, - hidden_dim=hidden_dim, - slot_names=slot_names, - l1=l1, - l2=l2): - - slotnum = len(slot_names) - for i in xrange(slotnum): - Inputs(slot_names[i] + network_name) - for i in xrange(slotnum): - Layer( - name = slot_names[i] + network_name, - type = "data", - size = word_dim, - device = -1, - ) - Layer( - name = slot_names[i] + "_embedding_" + network_name, - type = "mixed", - size = wordvec_dim, - bias = False, - device = -1, - inputs = TableProjection(slot_names[i] + network_name, - parameter_name = "embedding.w0", - decay_rate_l1=l1, - sparse_remote_update = True, - sparse_update = sparse_update, - ), - ) - Layer( - name = slot_names[i] + "_rnn1_" + network_name, - type = "recurrent", - active_type = "tanh", - bias = Bias(initial_std = 0, - parameter_name = "rnn1.bias"), - inputs = Input(slot_names[i] + "_embedding_" + network_name, - parameter_name = "rnn1.w0") - ) - Layer( - name = slot_names[i] + "_rnnlast_" + network_name, - type = "seqlastins", - inputs = [ - slot_names[i] + "_rnn1_" + network_name, - ], - ) - - Layer( - name = "layer2_" + network_name, - type = "fc", - active_type = "tanh", - size = layer2_dim, - bias = Bias(parameter_name = "layer2.bias"), - inputs = [Input(slot_name + "_rnnlast_" + network_name, - parameter_name = "_layer2_" + slot_name + ".w", - decay_rate = l2, - initial_smart = True) for slot_name in slot_names] - ) - Layer( - name = "layer3_" + network_name, - type = "fc", - active_type = "tanh", - size = layer3_dim, - bias = Bias(parameter_name = "layer3.bias"), - inputs = [ - Input("layer2_" + network_name, - parameter_name = "_layer3.w", - decay_rate = l2, - initial_smart = True), - ] - ) - Layer( - name = "output_" + network_name, - type = "fc", - size = 1, - bias = False, - inputs = [ - Input("layer3_" + network_name, - parameter_name = "_layerO.w"), - ], - ) - - -ltr_network("left") -ltr_network("right") -Inputs("label") -Layer( - name = "label", - type = "data", - size = 1, - ) -Outputs("cost", "qb_rnnlast_left") -Layer( - name = "cost", - type = "rank-cost", - inputs = ["output_left", "output_right", "label"], - ) diff --git a/paddle/trainer/tests/sample_trainer_config_opt_a.conf b/paddle/trainer/tests/sample_trainer_config_opt_a.conf deleted file mode 100644 index b1744db8d604c88ec47e7104f79b38bb9d0e4442..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_trainer_config_opt_a.conf +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -################################### Data Configuration ################################### -TrainData(ProtoData(files = "trainer/tests/mnist.list")) -################################### Algorithm Configuration ################################### -settings(batch_size = 1000, - learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) -################################### Network Configuration ################################### -data = data_layer(name ="input", size=784) - -fc1 = fc_layer(input=data, size=800, - bias_attr=True, - act=SigmoidActivation()) - -fc2 = fc_layer(input=fc1, size=800, - bias_attr=True, - act=SigmoidActivation()) - -output = fc_layer(input=[fc1, fc2], size=10, - bias_attr=True, - act=SoftmaxActivation()) - -lbl = data_layer(name ="label", size=1) - -cost = classification_cost(input=output, label=lbl) -outputs(cost) diff --git a/paddle/trainer/tests/sample_trainer_config_opt_b.conf b/paddle/trainer/tests/sample_trainer_config_opt_b.conf deleted file mode 100644 index b1744db8d604c88ec47e7104f79b38bb9d0e4442..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_trainer_config_opt_b.conf +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -################################### Data Configuration ################################### -TrainData(ProtoData(files = "trainer/tests/mnist.list")) -################################### Algorithm Configuration ################################### -settings(batch_size = 1000, - learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) -################################### Network Configuration ################################### -data = data_layer(name ="input", size=784) - -fc1 = fc_layer(input=data, size=800, - bias_attr=True, - act=SigmoidActivation()) - -fc2 = fc_layer(input=fc1, size=800, - bias_attr=True, - act=SigmoidActivation()) - -output = fc_layer(input=[fc1, fc2], size=10, - bias_attr=True, - act=SoftmaxActivation()) - -lbl = data_layer(name ="label", size=1) - -cost = classification_cost(input=output, label=lbl) -outputs(cost) diff --git a/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf b/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf deleted file mode 100644 index d19222360c2f424ddb306b155dfef07921098a6b..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf +++ /dev/null @@ -1,154 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. - -# Note: when making change to this file, please make sure -# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest -# for comparing these two nets can pass (test_CompareTwoNets) - -default_initial_std(0.1) -default_device(0) - -word_dim = 1451594 -l1 = 0 -l2 = 0 - -model_type("nn") - -sparse_update = get_config_arg("sparse_update", bool, False) - -TrainData(ProtoData( - type = "proto_sequence", - files = ('trainer/tests/train.list'), - )) - -Settings( - algorithm='sgd', - batch_size=100, - learning_rate=0.0001, - learning_rate_decay_a=4e-08, - learning_rate_decay_b=0.0, - learning_rate_schedule='poly', -) - - -wordvec_dim = 128 -layer2_dim = 96 -layer3_dim = 96 -hidden_dim = 128 - -slot_names = ["qb", "qw", "tb", "tw"] - -def ltr_network(network_name, - word_dim=word_dim, - wordvec_dim=wordvec_dim, - layer2_dim=layer2_dim, - layer3_dim=layer3_dim, - hidden_dim=hidden_dim, - slot_names=slot_names, - l1=l1, - l2=l2): - - slotnum = len(slot_names) - for i in xrange(slotnum): - Inputs(slot_names[i] + network_name) - for i in xrange(slotnum): - Layer( - name = slot_names[i] + network_name, - type = "data", - size = word_dim, - device = -1, - ) - Layer( - name = slot_names[i] + "_embedding_" + network_name, - type = "mixed", - size = wordvec_dim, - bias = False, - device = -1, - inputs = TableProjection(slot_names[i] + network_name, - parameter_name = "embedding.w0", - decay_rate_l1=l1, - sparse_remote_update = True, - sparse_update = sparse_update, - ), - ) - Layer( - name = slot_names[i] + "_rnn1_" + network_name, - type = "recurrent", - active_type = "tanh", - bias = Bias(initial_std = 0, - parameter_name = "rnn1.bias"), - inputs = Input(slot_names[i] + "_embedding_" + network_name, - parameter_name = "rnn1.w0") - ) - Layer( - name = slot_names[i] + "_rnnlast_" + network_name, - type = "seqlastins", - inputs = [ - slot_names[i] + "_rnn1_" + network_name, - ], - ) - - Layer( - name = "layer2_" + network_name, - type = "fc", - active_type = "tanh", - size = layer2_dim, - bias = Bias(parameter_name = "layer2.bias"), - inputs = [Input(slot_name + "_rnnlast_" + network_name, - parameter_name = "_layer2_" + slot_name + ".w", - decay_rate = l2, - initial_smart = True) for slot_name in slot_names] - ) - Layer( - name = "layer3_" + network_name, - type = "fc", - active_type = "tanh", - size = layer3_dim, - bias = Bias(parameter_name = "layer3.bias"), - inputs = [ - Input("layer2_" + network_name, - parameter_name = "_layer3.w", - decay_rate = l2, - initial_smart = True), - ] - ) - Layer( - name = "output_" + network_name, - type = "fc", - size = 1, - bias = False, - inputs = [ - Input("layer3_" + network_name, - parameter_name = "_layerO.w"), - ], - ) - - -ltr_network("left") -ltr_network("right") -Inputs("label") -Layer( - name = "label", - type = "data", - size = 1, - ) -Outputs("cost", "qb_rnnlast_left") -Layer( - name = "cost", - type = "rank-cost", - inputs = ["output_left", "output_right", "label"], - ) diff --git a/paddle/trainer/tests/sample_trainer_config_rnn.conf b/paddle/trainer/tests/sample_trainer_config_rnn.conf deleted file mode 100644 index b720d4d5a6ca59e207832a8c5410c2cb6074c439..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_trainer_config_rnn.conf +++ /dev/null @@ -1,180 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. - -# Note: when making change to this file, please make sure -# sample_trainer_config_qb_rnn.conf is changed accordingly so that the uniitest -# for comparing these two nets can pass (test_CompareTwoNets) - -default_initial_std(0.1) -default_device(0) - -word_dim = 1451594 -l1 = 0 -l2 = 0 - -model_type("recurrent_nn") - -sparse_update = get_config_arg("sparse_update", bool, False) - -TrainData(ProtoData( - type = "proto_sequence", - files = ('trainer/tests/train.list'), - )) - -Settings( - algorithm='sgd', - batch_size=100, - learning_rate=0.0001, - learning_rate_decay_a=4e-08, - learning_rate_decay_b=0.0, - learning_rate_schedule='poly', -) - - -wordvec_dim = 128 -layer2_dim = 96 -layer3_dim = 96 -hidden_dim = 128 - -slot_names = ["qb", "qw", "tb", "tw"] - -def SimpleRecurrentLayer(name, - size, - active_type, - bias, - input_layer_name, - parameter_name, - seq_reversed = False): - RecurrentLayerGroupBegin(name + "_layer_group", - in_links=[input_layer_name], - out_links=[name], - seq_reversed=seq_reversed) - memory_name = Memory(name=name, size=size) - Layer( - name = name, - type = "mixed", - size = size, - active_type = active_type, - bias = bias, - inputs = [IdentityProjection(input_layer_name), - FullMatrixProjection(memory_name, - parameter_name = parameter_name, - ), - ] - ) - RecurrentLayerGroupEnd(name + "_layer_group") - - -def ltr_network(network_name, - word_dim=word_dim, - wordvec_dim=wordvec_dim, - layer2_dim=layer2_dim, - layer3_dim=layer3_dim, - hidden_dim=hidden_dim, - slot_names=slot_names, - l1=l1, - l2=l2): - - slotnum = len(slot_names) - for i in xrange(slotnum): - Inputs(slot_names[i] + network_name) - for i in xrange(slotnum): - Layer( - name = slot_names[i] + network_name, - type = "data", - size = word_dim, - device = -1, - ) - Layer( - name = slot_names[i] + "_embedding_" + network_name, - type = "mixed", - size = wordvec_dim, - bias = False, - device = -1, - inputs = TableProjection(slot_names[i] + network_name, - parameter_name = "embedding.w0", - decay_rate_l1=l1, - sparse_remote_update = True, - sparse_update = sparse_update, - ), - ) - SimpleRecurrentLayer( - name = slot_names[i] + "_rnn1_" + network_name, - size = hidden_dim, - active_type = "tanh", - bias = Bias(initial_std = 0, - parameter_name = "rnn1.bias"), - input_layer_name = slot_names[i] + "_embedding_" + network_name, - parameter_name = "rnn1.w0", - ) - Layer( - name = slot_names[i] + "_rnnlast_" + network_name, - type = "seqlastins", - inputs = [ - slot_names[i] + "_rnn1_" + network_name, - ], - ) - Layer( - name = "layer2_" + network_name, - type = "fc", - active_type = "tanh", - size = layer2_dim, - bias = Bias(parameter_name = "layer2.bias"), - inputs = [Input(slot_name + "_rnnlast_" + network_name, - parameter_name = "_layer2_" + slot_name + ".w", - decay_rate = l2, - initial_smart = True) for slot_name in slot_names] - ) - Layer( - name = "layer3_" + network_name, - type = "fc", - active_type = "tanh", - size = layer3_dim, - bias = Bias(parameter_name = "layer3.bias"), - inputs = [ - Input("layer2_" + network_name, - parameter_name = "_layer3.w", - decay_rate = l2, - initial_smart = True), - ] - ) - Layer( - name = "output_" + network_name, - type = "fc", - size = 1, - bias = False, - inputs = [ - Input("layer3_" + network_name, - parameter_name = "_layerO.w"), - ], - ) - - -ltr_network("left") -ltr_network("right") -Inputs("label") -Layer( - name = "label", - type = "data", - size = 1, - ) -Outputs("cost", "qb_rnnlast_left") -Layer( - name = "cost", - type = "rank-cost", - inputs = ["output_left", "output_right", "label"], - ) diff --git a/paddle/trainer/tests/testPyDataWrapper.py b/paddle/trainer/tests/testPyDataWrapper.py index 2c29a274339747b78fbd6c27ae4070f0abbd4028..a76eeeacb91cdba305d2f71c6292f79e4b98dd73 100644 --- a/paddle/trainer/tests/testPyDataWrapper.py +++ b/paddle/trainer/tests/testPyDataWrapper.py @@ -20,28 +20,6 @@ import random import json import string - -@provider(slots=[ - SparseNonValueSlot(10), DenseSlot(2), SparseValueSlot(10), StringSlot(1), - IndexSlot(3) -]) -def processNonSequenceData(obj, filename): - with open(filename, "rb") as f: - for line in f: - slots_str = line.split(';') - index = int(slots_str[0]) - non_values = map(int, slots_str[1].split()[1:]) - dense = map(float, slots_str[2].split()[1:]) - strs = slots_str[4].strip().split(' ', 1)[1] - - def __values_mapper__(s): - s = s.split(":") - return int(s[0]), float(s[1]) - - values = map(__values_mapper__, slots_str[3].split()[1:]) - yield [non_values, dense, values, strs, index] - - SPARSE_ID_LIMIT = 1000 SPARSE_ID_COUNT = 100 SEQUENCE_LIMIT = 50 @@ -146,8 +124,6 @@ def processSubSeqAndGenerateData(obj, name): if __name__ == "__main__": - pvd = processNonSequenceData("test.txt") - print pvd.getNextBatch(100) pvd = processSeqAndGenerateData("_") print pvd.getNextBatch(100) pvd = processSubSeqAndGenerateData("_") diff --git a/paddle/trainer/tests/test_CompareTwoOpts.cpp b/paddle/trainer/tests/test_CompareTwoOpts.cpp deleted file mode 100644 index 383505f8131264844069d6f0fa13f4e0ac1f97af..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/test_CompareTwoOpts.cpp +++ /dev/null @@ -1,184 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include - -#include "paddle/trainer/Trainer.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_int32(gpu_id); - -DECLARE_bool(local); -DECLARE_bool(use_gpu); - -DECLARE_string(config); -DECLARE_string(nics); - -DEFINE_string(config_file_a, "", "config of one network to compare"); -DEFINE_string(config_file_b, "", "config of another network to compare"); -DEFINE_bool(need_high_accuracy, - true, - "whether need to run in double accuracy (recommended)"); -DEFINE_double( - max_diff_ratio, - 0.0f, - "max diff ratio allowed for outputs and parameters (value/gradient)"); - -struct ComData { - vector outArgs; - vector parameters; -}; - -void calcGradient(ComData& data, const string configFile) { - FLAGS_config = configFile; - - FLAGS_local = true; - FLAGS_use_gpu = false; - - FLAGS_nics = ""; - - *ThreadLocalRand::getSeed() = 0; - srand(0); - - Trainer trainer; - trainer.init(TrainerConfigHelper::createFromFlagConfig(), false); - - data.parameters = trainer.getGradientMachine()->getParameters(); - trainer.getDataProvider()->setSkipShuffle(); - trainer.train(); -} - -void checkBuffer(real* A, - const char* desA, - real* B, - const char* desB, - size_t len, - size_t width = 1) { - int nNum = 0; - for (size_t i = 0; i < len; ++i) { - real diff = fabs(A[i] - B[i]); - if (diff > 0.0f && - diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_max_diff_ratio) { - nNum++; - LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i] - << " " << desB << " : " << B[i]; - } - } - EXPECT_EQ(0, nNum); - LOG(INFO) << "\n\n"; -} - -void compareGradient(ComData& comDataA, ComData& comDataB) { - vector outArgsA = comDataA.outArgs; - vector outArgsB = comDataB.outArgs; - - for (size_t i = 0; i < outArgsA.size(); ++i) { - CpuMatrix matA(outArgsA[i].value->getHeight(), - outArgsA[i].value->getWidth()); - CpuMatrix matB(outArgsB[i].value->getHeight(), - outArgsB[i].value->getWidth()); - - matA.copyFrom(*outArgsA[i].value); - matB.copyFrom(*outArgsB[i].value); - - LOG(INFO) << "\n--------------------------------" - << " Check Network Output_" << i << ":" - << " -------------------------------------\n"; - checkBuffer(matA.getData(), - "network A output", - matB.getData(), - "network B output", - matA.getElementCnt(), - matA.getWidth()); - } - - vector& parametersA = comDataA.parameters; - vector& parametersB = comDataB.parameters; - - LOG(INFO) << "\n\n--------------------------------" - << " Check Gradient Machine Parameters:" - << " -------------------------------------\n"; - for (size_t i = 0; i < parametersA.size(); ++i) { - ParameterPtr parameterA, parameterB; - parameterA = parametersA[i]; - parameterB = parametersB[i]; - - CpuVector paraA(parameterA->getSize()); - CpuVector paraB(parameterB->getSize()); - paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE)); - paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE)); - - LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() - << " ; size : " << paraA.getSize() << " ------------"; - checkBuffer(paraA.getData(), - "Network A", - paraB.getData(), - "Network B", - paraA.getSize()); - - CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT)); - CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT)); - - LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName() - << " ; size : " << gradA.getSize() << " -----------"; - checkBuffer(gradA.getData(), - "Network A", - gradB.getData(), - "Network B", - gradA.getSize()); - } -} - -TEST(Trainer, create) { - ComData dataA; - calcGradient(dataA, FLAGS_config_file_a); - LOG(INFO) << "\n\ntraining of Network A is finished\n\n"; - - ComData dataB; - calcGradient(dataB, FLAGS_config_file_b); - LOG(INFO) << "\n\ntraining of the Network B is finished\n\n"; - - compareGradient(dataA, dataB); -} - -int main(int argc, char** argv) { - paddle::initMain(argc, argv); - testing::InitGoogleTest(&argc, argv); - initPython(argc, argv); - -#ifndef PADDLE_TYPE_DOUBLE - if (FLAGS_need_high_accuracy) { - LOG(INFO) << "skip test due to it's need high accuracy"; - return 0; - } - if (FLAGS_max_diff_ratio == 0.0f) { - FLAGS_max_diff_ratio = 2e-4; - LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio - << " in low accuracy mode"; - } -#else - if (FLAGS_max_diff_ratio == 0.0f) { - FLAGS_max_diff_ratio = 2e-7; - LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio - << " in high accuracy mode"; - } -#endif - int ret = RUN_ALL_TESTS(); - return ret; -} diff --git a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp index 66ec65e340a435a7260028611828fb28845e0728..92dc8aa9ec5ce281d1950d84260c1b9555e686a7 100644 --- a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp +++ b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp @@ -25,45 +25,9 @@ limitations under the License. */ #include #include "picojson.h" -void checkEqual(const paddle::Argument& expect, const paddle::Argument& actual); void checkValue(std::vector& arguments, picojson::array& arr); const std::string kDir = "./trainer/tests/pydata_provider_wrapper_dir/"; -TEST(PyDataProviderWrapper, NoSequenceData) { - paddle::DataConfig conf; - conf.set_type("py"); - conf.set_load_data_module(std::string("testPyDataWrapper")); - conf.set_load_data_object(std::string("processNonSequenceData")); - conf.set_async_load_data(false); - conf.clear_files(); - conf.set_files(kDir + "test_pydata_provider_wrapper.list"); - paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false)); - provider->setSkipShuffle(); - provider->reset(); - paddle::DataBatch batchFromPy; - provider->getNextBatch(100, &batchFromPy); - - paddle::DataConfig conf2; - conf2.set_type("proto"); - conf2.set_async_load_data(false); - conf2.clear_files(); - conf2.set_files(kDir + "test_pydata_provider_wrapper.protolist"); - - provider.reset(paddle::DataProvider::create(conf2, false)); - provider->setSkipShuffle(); - provider->reset(); - paddle::DataBatch batchFromProto; - provider->getNextBatch(100, &batchFromProto); - - std::vector& pyArguments = batchFromPy.getStreams(); - std::vector& protoArguments = batchFromProto.getStreams(); - EXPECT_EQ(pyArguments.size(), protoArguments.size()); - - for (size_t i = 0; i < pyArguments.size(); ++i) { - checkEqual(protoArguments[i], pyArguments[i]); - } -} - TEST(PyDataProviderWrapper, SequenceData) { paddle::DataConfig conf; conf.set_type("py"); @@ -148,66 +112,6 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -void checkEqual(const paddle::Argument& expect, - const paddle::Argument& actual) { - if (expect.value) { - EXPECT_TRUE(actual.value != nullptr); - paddle::Matrix* e = expect.value.get(); - paddle::Matrix* a = actual.value.get(); - EXPECT_EQ(e->getWidth(), a->getWidth()); - EXPECT_EQ(e->getHeight(), a->getHeight()); - if (dynamic_cast(e)) { - paddle::CpuSparseMatrix* se = dynamic_cast(e); - paddle::CpuSparseMatrix* sa = dynamic_cast(a); - EXPECT_EQ(se->getFormat(), sa->getFormat()); - EXPECT_EQ(se->getElementCnt(), sa->getElementCnt()); - size_t rowSize = se->getFormat() == paddle::SPARSE_CSC - ? se->getElementCnt() - : se->getHeight() + 1; - size_t colSize = se->getFormat() == paddle::SPARSE_CSC - ? se->getWidth() + 1 - : se->getElementCnt(); - for (size_t i = 0; i < rowSize; ++i) { - EXPECT_EQ(se->getRows()[i], sa->getRows()[i]); - } - for (size_t i = 0; i < colSize; ++i) { - EXPECT_EQ(se->getCols()[i], sa->getCols()[i]); - } - if (se->getValueType() == paddle::FLOAT_VALUE) { - EXPECT_EQ(paddle::FLOAT_VALUE, sa->getValueType()); - for (size_t i = 0; i < se->getElementCnt(); ++i) { - EXPECT_EQ(se->getValue()[i], sa->getValue()[i]); - } - } - } else if (dynamic_cast(e)) { - EXPECT_EQ(e->getElementCnt(), a->getElementCnt()); - for (size_t i = 0; i < e->getElementCnt(); ++i) { - EXPECT_EQ(e->getData()[i], a->getData()[i]); - } - } - } - - if (expect.ids) { - EXPECT_TRUE(actual.ids != nullptr); - paddle::VectorT* e = expect.ids.get(); - paddle::VectorT* a = actual.ids.get(); - EXPECT_EQ(e->getSize(), a->getSize()); - for (size_t i = 0; i < e->getSize(); ++i) { - EXPECT_EQ(e->getData()[i], a->getData()[i]); - } - } - - if (expect.strs) { - EXPECT_TRUE(actual.strs != nullptr); - std::vector* e = expect.strs.get(); - std::vector* a = actual.strs.get(); - EXPECT_EQ(e->size(), a->size()); - for (size_t i = 0; i < e->size(); ++i) { - EXPECT_EQ((*e)[i], (*a)[i]); - } - } -} - void checkValue(std::vector& arguments, picojson::array& arr) { // CHECK SLOT 0, Sparse Value. diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 5bd68e211ac1c8e05f40dc3ca37eef99f32af47f..d6128dd7692a2faebf453d239744c4893d84e369 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1826,7 +1826,7 @@ class FCLayer(LayerBase): self.layer_type = 'mkldnn_fc' config_assert( len(inputs) == 1, - "MkldnnFCLayer support one and only one input!") + "MKLDNNFCLayer support one and only one input!") super(FCLayer, self).__init__( name, self.layer_type, size, inputs=inputs, **xargs) for input_index in xrange(len(self.inputs)): @@ -1837,7 +1837,7 @@ class FCLayer(LayerBase): sparse = format == "csr" or format == "csc" if use_mkldnn: config_assert(not sparse, - "MkldnnFCLayer do not support sparse format yet") + "MKLDNNFCLayer do not support sparse format yet") if use_mkldnn_wgt: dims = [self.config.size, input_layer.size] if sparse: @@ -1853,7 +1853,7 @@ class FCLayer(LayerBase): @config_layer('mkldnn_fc') -class MkldnnFcLayer(FCLayer): +class MKLDNNFcLayer(FCLayer): layer_type = 'mkldnn_fc' @@ -3209,6 +3209,18 @@ class SubNestedSequenceLayer(LayerBase): self.set_layer_size(size) +@config_layer('dot_prod') +class DotProdLayer(LayerBase): + def __init__(self, name, inputs, device=None): + super(DotProdLayer, self).__init__( + name, 'dot_prod', 0, inputs, device=device) + config_assert(len(inputs) == 2, 'DotProdLayer must have 2 inputs.') + config_assert( + self.get_input_layer(0).size == self.get_input_layer(1).size, + "Two inputs should have the same size.") + self.set_layer_size(1) + + @config_layer('out_prod') class OuterProdLayer(LayerBase): def __init__(self, name, inputs, device=None): @@ -3506,11 +3518,17 @@ def ExpressionLayer(name, inputs, **xargs): @config_layer('concat') class ConcatenateLayer(LayerBase): + layer_type = 'concat' + def __init__(self, name, inputs, bias=False, **xargs): config_assert(inputs, 'inputs cannot be empty') config_assert(not bias, 'ConcatenateLayer cannot support bias.') + use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0))) + if self.layer_type == "mkldnn_concat": + config_assert(use_mkldnn, "mkldnn_concat only support MKLDNN") + self.layer_type = 'mkldnn_concat' if use_mkldnn else 'concat' super(ConcatenateLayer, self).__init__( - name, 'concat', 0, inputs=inputs, **xargs) + name, self.layer_type, 0, inputs=inputs, **xargs) size = 0 for input_index in xrange(len(self.inputs)): assert self.get_input_layer(0).height == self.get_input_layer( @@ -3530,6 +3548,11 @@ class ConcatenateLayer(LayerBase): self.set_layer_size(size) +@config_layer('mkldnn_concat') +class MKLDNNConcatLayer(ConcatenateLayer): + layer_type = 'mkldnn_concat' + + # like concat layer, but each input layer was processed by a Projection. @config_layer('concat2') class ConcatenateLayer2(LayerBase): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index a02eba007ddf929ff92df995df253f5a386bac7b..388535d53a9d1d6747ac89cb698f3a1f496b5f7c 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -115,6 +115,7 @@ __all__ = [ 'huber_classification_cost', 'block_expand_layer', 'maxout_layer', + 'dot_prod_layer', 'out_prod_layer', 'printer_layer', 'print_layer', @@ -197,6 +198,7 @@ class LayerType(object): SCALING_LAYER = 'scaling' TRANS_LAYER = 'trans' ROTATE_LAYER = 'rotate' + DOT_PROD_LAYER = 'dot_prod' OUT_PROD_LAYER = 'out_prod' FEATURE_MAP_EXPAND_LAYER = 'featmap_expand' @@ -4140,6 +4142,45 @@ def maxid_layer(input, name=None, layer_attr=None): size=l.config.size) +@wrap_name_default() +def dot_prod_layer(input1, input2, name=None, layer_attr=None): + """ + A layer for computing the dot product of two vectors. + + The example usage is: + + .. code-block:: python + + dot_prod = dot_prod_layer(input1=vec1, input2=vec2) + + :param name: The name of this layer. It is optional. + :type name: basestring + :param input1: The first input layer. + :type input: LayerOutput + :param input2: The second input layer. + :type input2: LayerOutput + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. + :type layer_attr: ExtraLayerAttribute. + :return: LayerOutput object. + :rtype: LayerOutput + """ + assert isinstance(input1, LayerOutput) + assert isinstance(input2, LayerOutput) + assert input1.size == input2.size, ("Two inputs should have the same size.") + + l = Layer( + name=name, + type=LayerType.DOT_PROD_LAYER, + inputs=[input1.name, input2.name], + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name=name, + layer_type=LayerType.DOT_PROD_LAYER, + parents=[input1, input2], + size=l.config.size) + + @wrap_name_default() def out_prod_layer(input1, input2, name=None, layer_attr=None): """ diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index 1c7451e0abf5dc1b99671f292e2ffc2d2282abe9..0b269a1ff76530774b4d23b0867350fd95e081a3 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -10,6 +10,7 @@ test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_la test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer -test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer test_scale_sub_region_layer) +test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer test_scale_sub_region_layer +test_dot_prod_layer) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr new file mode 100644 index 0000000000000000000000000000000000000000..f1530c382c3d81a82592af2c43c06eb4278e2b4a --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr @@ -0,0 +1,38 @@ +type: "nn" +layers { + name: "vector1" + type: "data" + size: 10 + active_type: "" +} +layers { + name: "vector2" + type: "data" + size: 10 + active_type: "" +} +layers { + name: "__dot_prod_layer_0__" + type: "dot_prod" + size: 1 + active_type: "" + inputs { + input_layer_name: "vector1" + } + inputs { + input_layer_name: "vector2" + } +} +input_layer_names: "vector1" +input_layer_names: "vector2" +output_layer_names: "__dot_prod_layer_0__" +sub_models { + name: "root" + layer_names: "vector1" + layer_names: "vector2" + layer_names: "__dot_prod_layer_0__" + input_layer_names: "vector1" + input_layer_names: "vector2" + output_layer_names: "__dot_prod_layer_0__" + is_recurrent_layer_group: false +} diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..e52d48dde0084aacd3f7874cc384d59287a0c7d5 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py @@ -0,0 +1,7 @@ +from paddle.trainer_config_helpers import * + +vec1 = data_layer(name='vector1', size=10) +vec2 = data_layer(name='vector2', size=10) +dot_product = dot_prod_layer(input1=vec1, input2=vec2) + +outputs(dot_product) diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index f20567243ae67baecbdbac13f879f4cf2f66d298..acca6ba35ced8674d4eec7dc57e41673c90cf8f8 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -4,7 +4,10 @@ import collections import numpy as np import copy -__all__ = ['Block', 'Variable', 'Program', 'Operator', 'default_startup_program', 'default_main_program'] +__all__ = [ + 'Block', 'Variable', 'Program', 'Operator', 'default_startup_program', + 'default_main_program' +] def unique_name(prefix): @@ -12,9 +15,9 @@ def unique_name(prefix): return "_".join([prefix, str(uid)]) -def _debug_string_(proto): +def _debug_string_(proto, throw_on_error=True): error_fields = list() - if not proto.IsInitialized(error_fields): + if not proto.IsInitialized(error_fields) and throw_on_error: raise ValueError("{0} are not initialized\nThe message is {1}".format( error_fields, proto)) return proto.__str__() @@ -101,9 +104,12 @@ class Variable(object): self.stop_gradient = stop_gradient def __str__(self): + return self.to_string(True) + + def to_string(self, throw_on_error): protostr = self.desc.serialize_to_string() proto = framework_pb2.VarDesc.FromString(str(protostr)) - return _debug_string_(proto) + return _debug_string_(proto, throw_on_error) __repr__ = __str__ @@ -229,17 +235,17 @@ class Operator(object): in_proto.name) if found: - in_argus = inputs[in_proto.name] - if not isinstance(in_argus, list): - in_argus = [in_argus] - if not in_proto.duplicable and len(in_argus) > 1: + in_args = inputs[in_proto.name] + if not isinstance(in_args, list): + in_args = [in_args] + if not in_proto.duplicable and len(in_args) > 1: raise ValueError( "Input %s expects only one input, but %d are given." - % (in_proto.name, len(in_argus))) - in_argu_names = [] - for argu in in_argus: - in_argu_names.append(argu.name) - self.desc.set_input(in_proto.name, in_argu_names) + % (in_proto.name, len(in_args))) + in_arg_names = [] + for arg in in_args: + in_arg_names.append(arg.name) + self.desc.set_input(in_proto.name, in_arg_names) else: self.desc.set_input(in_proto.name, []) @@ -257,18 +263,18 @@ class Operator(object): str(e) for e in given))) for out_proto in proto.outputs: - out_argus = outputs[out_proto.name] - if not isinstance(out_argus, list): - out_argus = [out_argus] - if not out_proto.duplicable and len(out_argus) > 1: + out_args = outputs[out_proto.name] + if not isinstance(out_args, list): + out_args = [out_args] + if not out_proto.duplicable and len(out_args) > 1: raise ValueError( "Output %s expects only one output, but %d are given." % - (out_proto.name, len(out_argus))) - out_argu_names = [] - for argu in out_argus: - out_argu_names.append(argu.name) - argu.op = self - self.desc.set_output(out_proto.name, out_argu_names) + (out_proto.name, len(out_args))) + out_arg_names = [] + for arg in out_args: + out_arg_names.append(arg.name) + arg.op = self + self.desc.set_output(out_proto.name, out_arg_names) if attrs is not None: if not isinstance(attrs, dict): @@ -291,10 +297,13 @@ class Operator(object): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) - def __str__(self): + def to_string(self, throw_on_error): protostr = self.desc.serialize_to_string() proto = framework_pb2.OpDesc.FromString(str(protostr)) - return _debug_string_(proto) + return _debug_string_(proto, throw_on_error) + + def __str__(self): + return self.to_string(True) __repr__ = __str__ @@ -349,9 +358,12 @@ class Block(object): self.program = program def __str__(self): + return self.to_string(True) + + def to_string(self, throw_on_error): protostr = self.desc.serialize_to_string() proto = framework_pb2.BlockDesc.FromString(str(protostr)) - return _debug_string_(proto) + return _debug_string_(proto, throw_on_error) __repr__ = __str__ @@ -454,9 +466,12 @@ class Program(object): self.current_block_idx = 0 def __str__(self): + return self.to_string(True) + + def to_string(self, throw_on_error): protostr = self.desc.serialize_to_string() proto = framework_pb2.ProgramDesc.FromString(str(protostr)) - return _debug_string_(proto) + return _debug_string_(proto, throw_on_error) def clone(self): p = Program() @@ -512,7 +527,14 @@ class Program(object): assert isinstance(target, Variable) if no_grad_set is None: no_grad_set = set() - param_to_grad_info = self.desc.append_backward(target.desc, no_grad_set) + try: + param_to_grad_info = self.desc.append_backward(target.desc, + no_grad_set) + except Exception as e: + raise core.EnforceNotMet( + str(e) + "\nCurrent protobuf is\n{0}".format( + self.to_string(False))) + self.sync_with_cpp() return param_to_grad_info @@ -563,8 +585,10 @@ class Parameter(Variable): g_main_program = Program() g_startup_program = Program() + def default_startup_program(): return g_startup_program + def default_main_program(): return g_main_program diff --git a/python/paddle/v2/fluid/net_drawer.py b/python/paddle/v2/fluid/net_drawer.py index 17ad547c2bb5b79ef8225dd1a8f1ef49a6572508..94fdd5e38970b309580de6fc934b158a3c46e464 100644 --- a/python/paddle/v2/fluid/net_drawer.py +++ b/python/paddle/v2/fluid/net_drawer.py @@ -66,10 +66,13 @@ def parse_graph(program, graph, var_dict, **kwargs): if not var_dict.has_key(var): var_dict[var] = "Feed" + temp_id = 0 proto = framework_pb2.ProgramDesc.FromString( program.desc.serialize_to_string()) for block in proto.blocks: for op in block.ops: + op.type = op.type + "_" + str(temp_id) + temp_id += 1 graph.node(**draw_node(op)) for o in op.outputs: for arg in o.arguments: @@ -78,6 +81,7 @@ def parse_graph(program, graph, var_dict, **kwargs): for arg in e.arguments: if var_dict.has_key(arg): graph.edge(**draw_edge(var_dict, op, e, arg)) + break # only plot the first block def draw_graph(startup_program, main_program, **kwargs): diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py index ee677a2c5670a092c509b9ce1c555223bf22957f..a7f3bfc0caf76302674a00c80c2bd9ebf834f872 100644 --- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py @@ -1,33 +1,22 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.framework as framework -from paddle.v2.fluid.io import save_persistables, load_persistables +import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor +from paddle.v2.fluid.io import save_persistables, load_persistables +from paddle.v2.fluid.optimizer import SGDOptimizer -import numpy as np - -x = layers.data( - name='x', - shape=[13], - data_type='float32') +x = layers.data(name='x', shape=[13], data_type='float32') -y_predict = layers.fc(input=x, - size=1, - act=None) +y_predict = layers.fc(input=x, size=1, act=None) -y = layers.data( - name='y', - shape=[1], - data_type='float32') +y = layers.data(name='y', shape=[1], data_type='float32') -cost = layers.square_error_cost( - input=y_predict, - label=y) +cost = layers.square_error_cost(input=y_predict, label=y) avg_cost = layers.mean(x=cost) -sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) +sgd_optimizer = SGDOptimizer(learning_rate=0.001) opts = sgd_optimizer.minimize(avg_cost) BATCH_SIZE = 20 diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py index f4be835b3ad57d5b0076e8a816c2c3def46e0663..b8506125501b6e533c4594b37943ec36ca8e7d30 100644 --- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py +++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py @@ -1,21 +1,16 @@ import numpy as np import paddle.v2 as paddle import paddle.v2.fluid.core as core +import paddle.v2.fluid.framework as framework import paddle.v2.fluid.layers as layers import paddle.v2.fluid.nets as nets -import paddle.v2.fluid.optimizer as optimizer from paddle.v2.fluid.executor import Executor -import paddle.v2.fluid.framework as framework from paddle.v2.fluid.initializer import XavierInitializer +from paddle.v2.fluid.optimizer import AdamOptimizer def resnet_cifar10(input, depth=32): - def conv_bn_layer(input, - ch_out, - filter_size, - stride, - padding, - act='relu'): + def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): tmp = layers.conv2d( input=input, filter_size=filter_size, @@ -24,9 +19,7 @@ def resnet_cifar10(input, depth=32): padding=padding, act=None, bias_attr=False) - return layers.batch_norm( - input=tmp, - act=act) + return layers.batch_norm(input=tmp, act=act) def shortcut(input, ch_in, ch_out, stride, program, init_program): if ch_in != ch_out: @@ -35,28 +28,11 @@ def resnet_cifar10(input, depth=32): else: return input - def basicblock(input, - ch_in, - ch_out, - stride): - tmp = conv_bn_layer( - input, - ch_out, - 3, - stride, - 1) - tmp = conv_bn_layer( - tmp, - ch_out, - 3, - 1, - 1, - act=None) + def basicblock(input, ch_in, ch_out, stride): + tmp = conv_bn_layer(input, ch_out, 3, stride, 1) + tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None) short = shortcut(input, ch_in, ch_out, stride) - return layers.elementwise_add( - x=tmp, - y=short, - act='relu') + return layers.elementwise_add(x=tmp, y=short, act='relu') def layer_warp(block_func, input, ch_in, ch_out, count, stride): tmp = block_func(input, ch_in, ch_out, stride) @@ -67,45 +43,17 @@ def resnet_cifar10(input, depth=32): assert (depth - 2) % 6 == 0 n = (depth - 2) / 6 conv1 = conv_bn_layer( - input=input, - ch_out=16, - filter_size=3, - stride=1, - padding=1) - res1 = layer_warp( - basicblock, - conv1, - 16, - 16, - n, - 1) - res2 = layer_warp( - basicblock, - res1, - 16, - 32, - n, - 2) - res3 = layer_warp( - basicblock, - res2, - 32, - 64, - n, - 2) + input=input, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) + res2 = layer_warp(basicblock, res1, 16, 32, n, 2) + res3 = layer_warp(basicblock, res2, 32, 64, n, 2) pool = layers.pool2d( - input=res3, - pool_size=8, - pool_type='avg', - pool_stride=1) + input=res3, pool_size=8, pool_type='avg', pool_stride=1) return pool def vgg16_bn_drop(input): - def conv_block(input, - num_filter, - groups, - dropouts): + def conv_block(input, num_filter, groups, dropouts): return nets.img_conv_group( input=input, pool_size=2, @@ -123,22 +71,14 @@ def vgg16_bn_drop(input): conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - drop = layers.dropout( - x=conv5, - dropout_prob=0.5) + drop = layers.dropout(x=conv5, dropout_prob=0.5) fc1 = layers.fc(input=drop, size=512, act=None, param_attr={"initializer": XavierInitializer()}) - reshape1 = layers.reshape( - x=fc1, - shape=list(fc1.shape + (1, 1))) - bn = layers.batch_norm( - input=reshape1, - act='relu') - drop2 = layers.dropout( - x=bn, - dropout_prob=0.5) + reshape1 = layers.reshape(x=fc1, shape=list(fc1.shape + (1, 1))) + bn = layers.batch_norm(input=reshape1, act='relu') + drop2 = layers.dropout(x=bn, dropout_prob=0.5) fc2 = layers.fc(input=drop2, size=512, act=None, @@ -165,8 +105,8 @@ cost = layers.cross_entropy(input=predict, label=label) avg_cost = layers.mean(x=cost) accuracy = layers.accuracy(input=predict, label=label) -# optimizer = optimizer.SGDOptimizer(learning_rate=0.001) -optimizer = optimizer.AdamOptimizer(learning_rate=0.001) +# optimizer = SGDOptimizer(learning_rate=0.001) +optimizer = AdamOptimizer(learning_rate=0.001) opts = optimizer.minimize(avg_cost) BATCH_SIZE = 128 diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py index f330ff58137068e429008bc7aa07bbc8d2e35ac4..75fbaf83e8f3e62eb0d0abef9cfa267b65e72973 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py @@ -1,22 +1,15 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.evaluator as evaluator import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.nets as nets from paddle.v2.fluid.executor import Executor +from paddle.v2.fluid.optimizer import AdamOptimizer -import numpy as np - -images = layers.data( - name='pixel', - shape=[1, 28, 28], - data_type='float32') -label = layers.data( - name='label', - shape=[1], - data_type='int64') +images = layers.data(name='pixel', shape=[1, 28, 28], data_type='float32') +label = layers.data(name='label', shape=[1], data_type='int64') conv_pool_1 = nets.simple_img_conv_pool( input=images, filter_size=5, @@ -32,17 +25,13 @@ conv_pool_2 = nets.simple_img_conv_pool( pool_stride=2, act="relu") -predict = layers.fc(input=conv_pool_2, - size=10, - act="softmax") +predict = layers.fc(input=conv_pool_2, size=10, act="softmax") cost = layers.cross_entropy(input=predict, label=label) avg_cost = layers.mean(x=cost) -optimizer = optimizer.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999) +optimizer = AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999) opts = optimizer.minimize(avg_cost) -accuracy, acc_out = evaluator.accuracy( - input=predict, - label=label) +accuracy, acc_out = evaluator.accuracy(input=predict, label=label) BATCH_SIZE = 50 PASS_NUM = 3 diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py index b0164e3e3659c19edf2af45e706fb48ac1fe2b1c..cf10b1942e6a8243b18b0ae4586fdd7ec1a665fb 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py @@ -1,19 +1,15 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.regularizer import L2DecayRegularizer from paddle.v2.fluid.initializer import UniformInitializer - -import numpy as np +from paddle.v2.fluid.optimizer import MomentumOptimizer +from paddle.v2.fluid.regularizer import L2DecayRegularizer BATCH_SIZE = 128 -image = layers.data( - name='x', - shape=[784], - data_type='float32') +image = layers.data(name='x', shape=[784], data_type='float32') param_attr = { 'name': None, @@ -22,32 +18,21 @@ param_attr = { 'regularization': L2DecayRegularizer(0.0005 * BATCH_SIZE) } -hidden1 = layers.fc(input=image, - size=128, - act='relu', - param_attr=param_attr) -hidden2 = layers.fc(input=hidden1, - size=64, - act='relu', - param_attr=param_attr) +hidden1 = layers.fc(input=image, size=128, act='relu', param_attr=param_attr) +hidden2 = layers.fc(input=hidden1, size=64, act='relu', param_attr=param_attr) predict = layers.fc(input=hidden2, size=10, act='softmax', param_attr=param_attr) -label = layers.data( - name='y', - shape=[1], - data_type='int64') +label = layers.data(name='y', shape=[1], data_type='int64') cost = layers.cross_entropy(input=predict, label=label) avg_cost = layers.mean(x=cost) -accuracy = layers.accuracy( - input=predict, - label=label) +accuracy = layers.accuracy(input=predict, label=label) -optimizer = optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9) +optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) opts = optimizer.minimize(avg_cost) train_reader = paddle.batch( diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py index eefcb55bebff41eb9c67d9f0c8e83a5f1d4599bd..55ded3aed3a23c8cd7795f915dc1cbd512c6d945 100644 --- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py @@ -1,12 +1,11 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.nets as nets from paddle.v2.fluid.executor import Executor - -import numpy as np +from paddle.v2.fluid.optimizer import SGDOptimizer IS_SPARSE = True USE_GPU = False @@ -19,10 +18,7 @@ def get_usr_combined_features(): USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1 - uid = layers.data( - name='user_id', - shape=[1], - data_type='int64') + uid = layers.data(name='user_id', shape=[1], data_type='int64') usr_emb = layers.embedding( input=uid, @@ -31,15 +27,11 @@ def get_usr_combined_features(): param_attr={'name': 'user_table'}, is_sparse=IS_SPARSE) - usr_fc = layers.fc(input=usr_emb, - size=32) + usr_fc = layers.fc(input=usr_emb, size=32) USR_GENDER_DICT_SIZE = 2 - usr_gender_id = layers.data( - name='gender_id', - shape=[1], - data_type='int64') + usr_gender_id = layers.data(name='gender_id', shape=[1], data_type='int64') usr_gender_emb = layers.embedding( input=usr_gender_id, @@ -47,14 +39,10 @@ def get_usr_combined_features(): param_attr={'name': 'gender_table'}, is_sparse=IS_SPARSE) - usr_gender_fc = layers.fc(input=usr_gender_emb, - size=16) + usr_gender_fc = layers.fc(input=usr_gender_emb, size=16) USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table) - usr_age_id = layers.data( - name='age_id', - shape=[1], - data_type="int64") + usr_age_id = layers.data(name='age_id', shape=[1], data_type="int64") usr_age_emb = layers.embedding( input=usr_age_id, @@ -62,14 +50,10 @@ def get_usr_combined_features(): is_sparse=IS_SPARSE, param_attr={'name': 'age_table'}) - usr_age_fc = layers.fc(input=usr_age_emb, - size=16) + usr_age_fc = layers.fc(input=usr_age_emb, size=16) USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1 - usr_job_id = layers.data( - name='job_id', - shape=[1], - data_type="int64") + usr_job_id = layers.data(name='job_id', shape=[1], data_type="int64") usr_job_emb = layers.embedding( input=usr_job_id, @@ -77,16 +61,12 @@ def get_usr_combined_features(): param_attr={'name': 'job_table'}, is_sparse=IS_SPARSE) - usr_job_fc = layers.fc(input=usr_job_emb, - size=16) + usr_job_fc = layers.fc(input=usr_job_emb, size=16) concat_embed = layers.concat( - input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], - axis=1) + input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1) - usr_combined_features = layers.fc(input=concat_embed, - size=200, - act="tanh") + usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") return usr_combined_features @@ -95,10 +75,7 @@ def get_mov_combined_features(): MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1 - mov_id = layers.data( - name='movie_id', - shape=[1], - data_type='int64') + mov_id = layers.data(name='movie_id', shape=[1], data_type='int64') mov_emb = layers.embedding( input=mov_id, @@ -107,36 +84,24 @@ def get_mov_combined_features(): param_attr={'name': 'movie_table'}, is_sparse=IS_SPARSE) - mov_fc = layers.fc(input=mov_emb, - size=32) + mov_fc = layers.fc(input=mov_emb, size=32) CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories()) - category_id = layers.data( - name='category_id', - shape=[1], - data_type='int64') + category_id = layers.data(name='category_id', shape=[1], data_type='int64') mov_categories_emb = layers.embedding( - input=category_id, - size=[CATEGORY_DICT_SIZE, 32], - is_sparse=IS_SPARSE) + input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE) mov_categories_hidden = layers.sequence_pool( - input=mov_categories_emb, - pool_type="sum") + input=mov_categories_emb, pool_type="sum") MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict()) - mov_title_id = layers.data( - name='movie_title', - shape=[1], - data_type='int64') + mov_title_id = layers.data(name='movie_title', shape=[1], data_type='int64') mov_title_emb = layers.embedding( - input=mov_title_id, - size=[MOV_TITLE_DICT_SIZE, 32], - is_sparse=IS_SPARSE) + input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE) mov_title_conv = nets.sequence_conv_pool( input=mov_title_emb, @@ -146,13 +111,10 @@ def get_mov_combined_features(): pool_type="sum") concat_embed = layers.concat( - input=[mov_fc, mov_categories_hidden, mov_title_conv], - axis=1) + input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1) # FIXME(dzh) : need tanh operator - mov_combined_features = layers.fc(input=concat_embed, - size=200, - act="tanh") + mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") return mov_combined_features @@ -162,18 +124,11 @@ def model(): mov_combined_features = get_mov_combined_features() # need cos sim - inference = layers.cos_sim( - X=usr_combined_features, - Y=mov_combined_features) + inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features) - label = layers.data( - name='score', - shape=[1], - data_type='float32') + label = layers.data(name='score', shape=[1], data_type='float32') - square_cost = layers.square_error_cost( - input=inference, - label=label) + square_cost = layers.square_error_cost(input=inference, label=label) avg_cost = layers.mean(x=square_cost) @@ -182,7 +137,7 @@ def model(): def main(): cost = model() - sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.2) + sgd_optimizer = SGDOptimizer(learning_rate=0.2) opts = sgd_optimizer.minimize(cost) if USE_GPU: diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py index 91fc79a9870a31205098d8a40de6c033d5bf60b9..e69b915a9cfaf9e06075991975563a1fc1196661 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py @@ -1,12 +1,11 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.nets as nets from paddle.v2.fluid.executor import Executor - -import numpy as np +from paddle.v2.fluid.optimizer import AdamOptimizer def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32): @@ -31,7 +30,7 @@ def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32): act="softmax") cost = layers.cross_entropy(input=prediction, label=label) avg_cost = layers.mean(x=cost) - adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002) + adam_optimizer = AdamOptimizer(learning_rate=0.002) opts = adam_optimizer.minimize(avg_cost) acc = layers.accuracy(input=prediction, label=label) return avg_cost, acc diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py index 8c3d4488354eb363cd1d378ebd4cb8069e7c1b1d..65d44542501e6531fc1912cbc726a1d903b9c031 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py @@ -1,12 +1,10 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor - -import numpy as np +from paddle.v2.fluid.optimizer import AdamOptimizer def stacked_lstm_net(input_dim, @@ -41,7 +39,7 @@ def stacked_lstm_net(input_dim, act='softmax') cost = layers.cross_entropy(input=prediction, label=label) avg_cost = layers.mean(x=cost) - adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002) + adam_optimizer = AdamOptimizer(learning_rate=0.002) opts = adam_optimizer.minimize(avg_cost) acc = layers.accuracy(input=prediction, label=label) return avg_cost, acc diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py index a7d791c1f38d4843f084127e879d613b21ae8daf..280f6e902c34512735a27586221c2be68963ef2b 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py @@ -1,11 +1,10 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor - -import numpy as np +from paddle.v2.fluid.optimizer import AdamOptimizer def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50): @@ -33,7 +32,7 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50): cost = layers.cross_entropy(input=prediction, label=label) avg_cost = layers.mean(x=cost) - adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002) + adam_optimizer = AdamOptimizer(learning_rate=0.002) opts = adam_optimizer.minimize(avg_cost) acc = layers.accuracy(input=prediction, label=label) diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py index 9dcb6f2fea06ea8cd061be4f148854408779f990..afa7b285198e0349317e123e4bd98e8336217afa 100644 --- a/python/paddle/v2/fluid/tests/book/test_word2vec.py +++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py @@ -1,11 +1,10 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor - -import numpy as np +from paddle.v2.fluid.optimizer import SGDOptimizer PASS_NUM = 100 EMBED_SIZE = 32 @@ -17,26 +16,11 @@ IS_SPARSE = True word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) -first_word = layers.data( - name='firstw', - shape=[1], - data_type='int64') -second_word = layers.data( - name='secondw', - shape=[1], - data_type='int64') -third_word = layers.data( - name='thirdw', - shape=[1], - data_type='int64') -forth_word = layers.data( - name='forthw', - shape=[1], - data_type='int64') -next_word = layers.data( - name='nextw', - shape=[1], - data_type='int64') +first_word = layers.data(name='firstw', shape=[1], data_type='int64') +second_word = layers.data(name='secondw', shape=[1], data_type='int64') +third_word = layers.data(name='thirdw', shape=[1], data_type='int64') +forth_word = layers.data(name='forthw', shape=[1], data_type='int64') +next_word = layers.data(name='nextw', shape=[1], data_type='int64') embed_first = layers.embedding( input=first_word, @@ -64,19 +48,12 @@ embed_forth = layers.embedding( param_attr={'name': 'shared_w'}) concat_embed = layers.concat( - input=[embed_first, embed_second, embed_third, embed_forth], - axis=1) -hidden1 = layers.fc(input=concat_embed, - size=HIDDEN_SIZE, - act='sigmoid') -predict_word = layers.fc(input=hidden1, - size=dict_size, - act='softmax') -cost = layers.cross_entropy( - input=predict_word, - label=next_word) + input=[embed_first, embed_second, embed_third, embed_forth], axis=1) +hidden1 = layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid') +predict_word = layers.fc(input=hidden1, size=dict_size, act='softmax') +cost = layers.cross_entropy(input=predict_word, label=next_word) avg_cost = layers.mean(x=cost) -sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) +sgd_optimizer = SGDOptimizer(learning_rate=0.001) opts = sgd_optimizer.minimize(avg_cost) train_reader = paddle.batch( diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py index 907b52c405d9e5c02c70f611e4c777ba21948c40..2240dc73cdd31f320fed174dd811e93c6640137f 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_op.py +++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py @@ -110,13 +110,30 @@ class TestConv2dOp(OpTest): self.op_type = "conv2d" +class TestWithPad(TestConv2dOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] / self.groups + self.filter_size = [6, f_c, 3, 3] + + +class TestWithStride(TestConv2dOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] / self.groups + self.filter_size = [6, f_c, 3, 3] + + class TestWithGroup(TestConv2dOp): def init_group(self): self.groups = 3 - def init_op_type(self): - self.op_type = "conv2d" - class TestWith1x1(TestConv2dOp): def init_test_case(self): @@ -127,15 +144,9 @@ class TestWith1x1(TestConv2dOp): f_c = self.input_size[1] / self.groups self.filter_size = [6, f_c, 1, 1] - def init_dilation(self): - self.dilations = [1, 1] - def init_group(self): self.groups = 3 - def init_op_type(self): - self.op_type = "conv2d" - class TestWithDilation(TestConv2dOp): def init_test_case(self): @@ -152,14 +163,19 @@ class TestWithDilation(TestConv2dOp): def init_group(self): self.groups = 3 + +#----------------Conv2dCudnn---------------- +class TestCudnn(TestConv2dOp): def init_op_type(self): - self.op_type = "conv2d" + self.op_type = "conv_cudnn" -#----------------Conv2dCudnn---------------- +class TestCudnnWithPad(TestWithPad): + def init_op_type(self): + self.op_type = "conv_cudnn" -class TestCudnn(TestConv2dOp): +class TestCudnnWithStride(TestWithStride): def init_op_type(self): self.op_type = "conv_cudnn" diff --git a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py index 54349c018c4a53b8767d6cd4f94d99c719dc0237..d7b1f2f2a3abf6335998742dbbef8e17794170fa 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py +++ b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py @@ -4,9 +4,7 @@ from op_test import OpTest def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): - # [2, 3, 5, 5] in_n, in_c, in_h, in_w = input_.shape - # [3, 6, 3, 3] f_c, out_c, f_h, f_w = filter_.shape assert in_c == f_c @@ -29,6 +27,7 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): j1, j2 = j * stride[0], j * stride[0] + f_w out[n, k, i1:i2, j1:j2] += tmp_out + out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]] return out @@ -36,8 +35,6 @@ class TestConv2dTransposeOp(OpTest): def setUp(self): # init as conv transpose self.init_op_type() - - # [2, 3, 5, 5] -> kernel [3, 6, 3, 3] -> output [2, 6, 7, 7] self.init_test_case() conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad} @@ -55,7 +52,6 @@ class TestConv2dTransposeOp(OpTest): self.outputs = {'Output': output} def test_check_output(self): - print 'check output here for', self.op_type self.check_output() def test_check_grad_no_input(self): @@ -88,6 +84,26 @@ class TestConv2dTransposeOp(OpTest): self.op_type = "conv2d_transpose" +class TestWithPad(TestConv2dTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + +class TestWithStride(TestConv2dTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.dilations = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + # ------------ test_cudnn ------------ class TestCudnn(TestConv2dTransposeOp): def init_op_type(self): diff --git a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py index 132fe7931438a30cf02e4ad2894c0838e48ffc9f..8fd34b87bfea91307f52fdcbb9f71f2e1a9c6c56 100644 --- a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py +++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py @@ -4,9 +4,7 @@ from op_test import OpTest def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): - # [2, 3, 5, 5, 5] in_n, in_c, in_d, in_h, in_w = input_.shape - # [3, 6, 3, 3, 3] f_c, out_c, f_d, f_h, f_w = filter_.shape assert in_c == f_c @@ -14,7 +12,6 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): out_d = (in_d - 1) * stride[0] + f_d out_h = (in_h - 1) * stride[1] + f_h out_w = (in_w - 1) * stride[2] + f_w - out = np.zeros((in_n, out_c, out_d, out_h, out_w)) for n in range(in_n): @@ -33,6 +30,8 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): j1, j2 = j * stride[2], j * stride[2] + f_w out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out + out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w - + pad[2]] return out @@ -40,8 +39,6 @@ class TestConv3dTransposeOp(OpTest): def setUp(self): # init as conv transpose self.init_op_type() - - # [2, 3, 5, 5, 5] -> kernel [3, 6, 3, 3, 3] -> output [2, 6, 7, 7, 7] self.init_test_case() conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad} @@ -49,7 +46,6 @@ class TestConv3dTransposeOp(OpTest): filter_ = np.random.random(self.filter_size).astype("float32") output = conv3dtranspose_forward_naive( input_, filter_, conv3dtranspose_param).astype("float32") - # print 'deconv output py', output, output.shape self.inputs = {'Input': input_, 'Filter': filter_} self.attrs = { @@ -60,7 +56,6 @@ class TestConv3dTransposeOp(OpTest): self.outputs = {'Output': output} def test_check_output(self): - print 'check output here' self.check_output() def test_check_grad(self): @@ -85,7 +80,7 @@ class TestConv3dTransposeOp(OpTest): self.pad = [0, 0, 0] self.stride = [1, 1, 1] self.dilations = [1, 1, 1] - self.input_size = [2, 3, 5, 5, 5] # NCHW + self.input_size = [2, 3, 5, 5, 5] # NCDHW f_c = self.input_size[1] self.filter_size = [f_c, 6, 3, 3, 3] @@ -93,5 +88,31 @@ class TestConv3dTransposeOp(OpTest): self.op_type = "conv3d_transpose" +class TestWithPad(TestConv3dTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + +class TestWithStride(TestConv3dTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + +# ------------ test_cudnn ------------ +class TestCudnn(TestConv3dTransposeOp): + def init_op_type(self): + self.op_type = "conv3d_transpose_cudnn" + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_gru_op.py b/python/paddle/v2/fluid/tests/test_gru_op.py index b2474cff94c6c71cc62bc8e69a5d83e38d51c511..fa2c5a53ec4a01b6545e25f773c11277a4d24706 100644 --- a/python/paddle/v2/fluid/tests/test_gru_op.py +++ b/python/paddle/v2/fluid/tests/test_gru_op.py @@ -6,7 +6,8 @@ from test_lstm_op import identity, sigmoid, tanh, relu class TestGRUOp(OpTest): - batch_size = 9 + lod = [[0, 2, 6, 9]] + batch_size = lod[0][-1] frame_size = 5 activate = { 'identity': identity, @@ -35,7 +36,7 @@ class TestGRUOp(OpTest): seq_starts[sorted_seqs[i]] + batch_idx) idx_in_seq.append(idx) idx_in_seq_list.append(idx_in_seq) - return idx_in_seq_list + return idx_in_seq_list, sorted_seqs def gru_step(self, x, h_p, w, b): batch_size = x.shape[0] @@ -66,8 +67,8 @@ class TestGRUOp(OpTest): batch_hidden = self.outputs['BatchHidden'] hidden = self.outputs['Hidden'] idx_in_seq_list = self.idx_in_seq_list - h_p = self.inputs['H0'] if self.inputs.has_key('H0') else np.zeros( - (len(idx_in_seq_list[0]), self.frame_size)) + h_p = self.inputs['H0'][self.sorted_seqs] if self.inputs.has_key( + 'H0') else np.zeros((len(idx_in_seq_list[0]), self.frame_size)) num_batch = len(idx_in_seq_list) end_idx = 0 for batch_idx in range(num_batch): @@ -84,8 +85,9 @@ class TestGRUOp(OpTest): return batch_gate, batch_reset_hidden_prev, hidden def set_data(self): - lod = [[0, 2, 6, self.batch_size]] - self.idx_in_seq_list = self.seq_to_batch(lod, self.is_reverse) + lod = self.lod + self.idx_in_seq_list, self.sorted_seqs = self.seq_to_batch( + lod, self.is_reverse) batch_size = self.batch_size frame_size = self.frame_size input = np.random.rand(batch_size, frame_size * 3).astype('float64') @@ -146,7 +148,7 @@ class TestGRUOpReverse(TestGRUOp): def set_confs(self): self.is_reverse = True self.attrs = { - 'activation': 'identity', + 'activation': 'tanh', 'gate_activation': 'sigmoid', 'is_reverse': self.is_reverse } diff --git a/python/paddle/v2/fluid/tests/test_is_empty_op.py b/python/paddle/v2/fluid/tests/test_is_empty_op.py new file mode 100644 index 0000000000000000000000000000000000000000..ed6e3fe24f6333c9c90d760787eb13241a7e1868 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_is_empty_op.py @@ -0,0 +1,43 @@ +import unittest +import numpy as np +from paddle.v2.fluid.op import Operator +import paddle.v2.fluid.core as core + + +def create_tensor(scope, name, np_data): + tensor = scope.var(name).get_tensor() + tensor.set_dims(np_data.shape) + tensor.set(np_data, core.CPUPlace()) + return tensor + + +class TestIsEmptyOp(unittest.TestCase): + def setUp(self): + self.scope = core.Scope() + # create input variables + np_data0 = np.array([0, 1, 2]) + create_tensor(self.scope, "X0", np_data0) + + np_data1 = np.array([1]) + t = create_tensor(self.scope, "X1", np_data1) + t.set_dims([0]) + + # create output variables + self.scope.var("out") + + def test_no_empty(self): + self.one_case("X0", False) + + def test_empty(self): + self.one_case("X1", True) + + def one_case(self, input, target): + op = Operator(type="is_empty", X=input, Out="out") + ctx = core.DeviceContext.create(core.CPUPlace()) + op.run(self.scope, ctx) + out = self.scope.var("out").get_tensor() + self.assertEqual(np.array(out)[0], target) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_while_op.py b/python/paddle/v2/fluid/tests/test_while_op.py index 0f01acb3b94dc55a3536e751108e785ddc6e47bb..84b432333f950f754a97bc1a051b59c16fb22aed 100644 --- a/python/paddle/v2/fluid/tests/test_while_op.py +++ b/python/paddle/v2/fluid/tests/test_while_op.py @@ -2,6 +2,7 @@ import unittest import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor import paddle.v2.fluid.core as core +from paddle.v2.fluid.backward import append_backward_ops import numpy @@ -16,7 +17,7 @@ class TestWhileOp(unittest.TestCase): i = layers.zeros(shape=[1], dtype='int64') i.stop_gradient = True init = layers.zeros(shape=[10], dtype='float32') - mem_array = layers.array_write(init, i=i) + mem_array = layers.array_write(x=init, i=i) data_array = layers.array_write(x=d0, i=i) i = layers.increment(i) @@ -29,17 +30,23 @@ class TestWhileOp(unittest.TestCase): i.stop_gradient = True array_len = layers.fill_constant(shape=[1], dtype='int64', value=3) + array_len.stop_gradient = True cond = layers.less_than(x=i, y=array_len) while_op = layers.While(cond=cond) with while_op.block(): d = layers.array_read(array=data_array, i=i) prev = layers.array_read(array=mem_array, i=i) - i = layers.increment(x=i, in_place=True) result = layers.sums(input=[d, prev]) + + i = layers.increment(x=i, in_place=True) layers.array_write(result, i=i, array=mem_array) layers.less_than(x=i, y=array_len, cond=cond) - sum_result = layers.array_read(mem_array, i=array_len) + + sum_result = layers.array_read(array=mem_array, i=i) + loss = layers.mean(x=sum_result) + + append_backward_ops(loss) cpu = core.CPUPlace() exe = Executor(cpu)