diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 980a97a07c996eca2e8c126a6ad5ab7f340fa1e5..bb8c88787d37faf9ce4d7d856a307c11f1085d98 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,10 +17,14 @@ - id: detect-private-key files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - id: end-of-file-fixer -- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git - sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 +- repo: local hooks: - - id: clang-formater + - id: clang-format + name: clang-format + description: Format files with ClangFormat. + entry: clang-format -i + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ - repo: https://github.com/PaddlePaddle/pre-commit-golang sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 hooks: diff --git a/.travis.yml b/.travis.yml index 376c693602b56fe719decfeb41c217497e143e12..8c8c6699d3d9abddd65a3a224c2bceedc7d88348 100644 --- a/.travis.yml +++ b/.travis.yml @@ -38,7 +38,7 @@ before_install: # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker - - pip install rarfile + - pip install rarfile nltk==3.2.2 scipy==0.19.0 recordio matplotlib Pillow - curl https://glide.sh/get | bash - eval "$(GIMME_GO_VERSION=1.8.3 gimme)" - go get -u github.com/alecthomas/gometalinter diff --git a/CMakeLists.txt b/CMakeLists.txt index c7d743e193e7d32dbc0b56f3bcb05b6c61f85f1d..b174831109372cb014741d63032fa6a470e74042 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,8 +36,8 @@ include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) -option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." 
OFF) -option(WITH_MKLML "Compile PaddlePaddle with mklml package." OFF) +option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND}) +option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND}) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) diff --git a/Dockerfile b/Dockerfile index 8cfb16928c95dcbfac08383d32562ff67933d873..06a3d8930769bca2599a7afedb3683b2207cb302 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,25 +27,27 @@ RUN apt-get update && \ git python-pip python-dev openssh-server bison \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ - python-numpy python-matplotlib gcc g++ \ + python-matplotlib gcc-4.8 g++-4.8 \ automake locales clang-format-3.8 swig doxygen cmake \ liblapack-dev liblapacke-dev libboost-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ net-tools && \ apt-get clean -y +# paddle is using numpy.flip, which is introduced since 1.12.0 +RUN pip --no-cache-dir install 'numpy>=1.12.0' + # Install Go and glide -RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ - tar -C /usr/local -xzf go.tgz && \ +RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ mkdir /root/gopath && \ mkdir /root/gopath/bin && \ - mkdir /root/gopath/src && \ - rm go.tgz + mkdir /root/gopath/src ENV GOROOT=/usr/local/go GOPATH=/root/gopath # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. 
ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin # install glide -RUN curl -q https://glide.sh/get | sh +RUN curl -s -q https://glide.sh/get | sh # git credential to skip password typing RUN git config --global credential.helper store diff --git a/README.md b/README.md index 2a6beeb342b34f8e91ef509d7d41f286a666480c..b9793c3eab5d40c28f01cc67ad607b97261b3235 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ We provide [English](http://doc.paddlepaddle.org/develop/doc/) and - [Deep Learning 101](http://book.paddlepaddle.org/index.html) - You might want to start from the this online interactive book that can run in Jupyter Notebook. + You might want to start from this online interactive book that can run in Jupyter Notebook. - [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 69220e03fe8e337205f31cb1f45e3e19ae4f5d1e..2ac098954647d37e26ac2499e0675dae39910edc 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -74,8 +74,6 @@ if(WITH_MKLDNN) set(OPENMP_FLAGS "-fopenmp") set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") else() diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake index 656e1a0803c6e389d70f37f592c3aa2e95a2bcd4..5184f0815faac005b3dff1015395235f4e19d65b 100644 --- a/cmake/cpplint.cmake +++ b/cmake/cpplint.cmake @@ -42,26 +42,21 @@ macro(add_style_check_target TARGET_NAME) if(WITH_STYLE_CHECK) set(SOURCES_LIST ${ARGN}) list(REMOVE_DUPLICATES SOURCES_LIST) - list(SORT SOURCES_LIST) - foreach(filename ${SOURCES_LIST}) - 
set(LINT ON) foreach(pattern ${IGNORE_PATTERN}) if(filename MATCHES ${pattern}) - message(STATUS "DROP LINT ${filename}") - set(LINT OFF) + list(REMOVE_ITEM SOURCES_LIST ${filename}) endif() endforeach() - if(LINT MATCHES ON) - # cpplint code style - get_filename_component(base_filename ${filename} NAME) - set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint) - add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD - COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py" - "--filter=${STYLE_FILTER}" - "--write-success=${CUR_GEN}" ${filename} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) - endif() endforeach() + + if(SOURCES_LIST) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py" + "--filter=${STYLE_FILTER}" + ${SOURCES_LIST} + COMMENT "cpplint: Checking source code style" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() endif() endmacro() diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake index 45e3764e8482a4cfc8ee72fe4d79f04a3c9b74fa..85cce80b70a1fcf57015ac7a264e4950616b2717 100644 --- a/cmake/external/any.cmake +++ b/cmake/external/any.cmake @@ -7,8 +7,8 @@ INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any) ExternalProject_Add( extern_lib_any ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/thelink2012/any.git" - GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020" + GIT_REPOSITORY "https://github.com/PaddlePaddle/any.git" + GIT_TAG "15595d8324be9e8a9a80d9ae442fdd12bd66df5d" PREFIX ${ANY_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index a0d0a892c4b3cc3743ac725f3cd90444f18abf34..16e5bef4cdb8d6513de51838e3c3c8398dbad60d 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY 
"https://github.com/gflags/gflags.git" + # TODO(yiwang): The annoying warnings mentioned in + # https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by + # gflags. I fired a PR https://github.com/gflags/gflags/pull/230 + # to fix it. Before it gets accepted by the gflags team, we use + # my personal fork, which contains above fix, temporarily. Let's + # change this back to the official Github repo once my PR is + # merged. + GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git" PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index eff15de73f23db6dea3a7b79006bfec90d712ae5..25c6b4ef52d3f8ebff1572ae8d348be7c577c08c 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,34 +20,30 @@ INCLUDE(ExternalProject) SET(MKLDNN_PROJECT "extern_mkldnn") SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) -SET(MKLDNN_INSTALL_ROOT ${CMAKE_INSTALL_PREFIX}) -IF(NOT "$ENV{HOME}" STREQUAL "/root") - SET(MKLDNN_INSTALL_ROOT "$ENV{HOME}") -ENDIF() - -SET(MKLDNN_INSTALL_DIR "${MKLDNN_INSTALL_ROOT}/opt/paddle/third_party/mkldnn") -SET(MKLDNN_INCLUDE_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) +SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) +SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) -IF(WIN32) - MESSAGE(WARNING "It is not supported compiling with mkldnn in windows Paddle yet." - "Force WITH_MKLDNN=OFF") - SET(WITH_MKLDNN OFF) +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with MKLDNN in Paddle yet." + "Force WITH_MKLDNN=OFF") + SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE) return() -ELSE(WIN32) - SET(MKLDNN_LIBRARY "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." 
FORCE) - MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path") - SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) - #SET(CMAKE_MACOSX_RPATH 1) # hold for MacOS - SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") -ENDIF(WIN32) +ENDIF() + +SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE) +MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path") +SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") -INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR}) +INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) IF(${CBLAS_PROVIDER} STREQUAL "MKLML") SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) SET(MKLDNN_MKLROOT ${MKLML_ROOT}) SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB}) SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR}) + MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}") ENDIF() ExternalProject_Add( @@ -57,16 +53,15 @@ ExternalProject_Add( GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" GIT_TAG "v0.9" PREFIX ${MKLDNN_SOURCES_DIR} - CONFIGURE_COMMAND mkdir -p /build - BUILD_COMMAND cd /build - && cmake .. 
-DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} -DMKLROOT=${MKLDNN_MKLROOT} - && $(MAKE) - INSTALL_COMMAND cd /build && $(MAKE) install UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} + CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} + -DMKLROOT:PATH=${MKLDNN_MKLROOT} ) ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIBRARY}) +SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) -MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIBRARY}") +MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}") LIST(APPEND external_project_dependencies mkldnn) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 3f940756a4abb79aba7d3561db19db8532a0b673..e9fd3d4bedc983ae7c544cf289dc841cf22f9de4 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -16,19 +16,23 @@ IF(NOT ${WITH_MKLML}) return() ENDIF(NOT ${WITH_MKLML}) +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with MKLML in Paddle yet." 
+ "Force WITH_MKLML=OFF") + SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE) + return() +ENDIF() + INCLUDE(ExternalProject) SET(MKLML_PROJECT "extern_mklml") -SET(MKLML_VER "mklml_lnx_2018.0.20170425") +SET(MKLML_VER "mklml_lnx_2018.0.20170720") SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz") SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") -SET(MKLML_DST_DIR "opt/paddle/third_party/mklml") -SET(MKLML_INSTALL_ROOT "${CMAKE_INSTALL_PREFIX}") -IF(NOT "$ENV{HOME}" STREQUAL "/root") - SET(MKLML_INSTALL_ROOT "$ENV{HOME}") -ENDIF() - +SET(MKLML_DST_DIR "mklml") +SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER}) SET(MKLML_INC_DIR ${MKLML_ROOT}/include) @@ -39,22 +43,21 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) -SET(mklml_cmakefile ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt) -FILE(WRITE ${mklml_cmakefile} "PROJECT(MKLML)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${MKLML_VER}\n" - " DESTINATION ${MKLML_DST_DIR})\n") +FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(MKLML)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${MKLML_VER}\n" + " DESTINATION ${MKLML_DST_DIR})\n") ExternalProject_Add( ${MKLML_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${MKLML_SOURCE_DIR} DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate -O ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz ${MKLML_URL} - && tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz + DOWNLOAD_COMMAND wget --no-check-certificate -qO- ${MKLML_URL} | tar xz -C ${MKLML_DOWNLOAD_DIR} DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} 
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT} ) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 60a1041936437775e0994157b8ffcb7c52b7ab87..db09232c0e69016bf18c1d981e4620e9e804ff7c 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -69,8 +69,13 @@ ENDIF(NOT ${CBLAS_FOUND}) MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) -ADD_LIBRARY(cblas STATIC IMPORTED) -SET_PROPERTY(TARGET cblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES}) +# FIXME(gangliao): generate cblas target to track all high performance +# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas) +SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c) +FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") +ADD_LIBRARY(cblas STATIC ${dummyfile}) +TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) + IF(NOT ${CBLAS_FOUND}) ADD_DEPENDENCIES(cblas extern_openblas) LIST(APPEND external_project_dependencies cblas) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 67a359d4b5f4cca8fc8e74eab4d4acb4cc12baed..490c87d67ed79a238dd506127cd4d9855fab6626 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -24,7 +24,6 @@ IF(WITH_PYTHON) ENDIF(WITH_PYTHON) SET(py_env "") -SET(USE_VIRTUALENV_FOR_TEST 1) IF(PYTHONINTERP_FOUND) find_python_module(pip REQUIRED) find_python_module(numpy REQUIRED) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index ef31c252038ce18655913c0f41343fe6dc7dbb86..e26d8d9df386e65137aa83cc60a43bfeabf7a4a6 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -9,6 +9,11 @@ function(CheckCompilerCXX11Flag) if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") endif() + # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem. + # Use Debug mode instead for now. 
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE) + endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" # Apple Clang is a different compiler than upstream Clang which havs different version numbers. @@ -110,7 +115,7 @@ set(COMMON_FLAGS -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs - -Wno-error=parentheses-equality # Warnings in Pybind11 + -Wno-error=parentheses-equality # Warnings in pybind11 ) set(GPU_COMMON_FLAGS @@ -190,6 +195,7 @@ endif() # Modern gpu architectures: Pascal if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0") list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60") + list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr) endif() # Custom gpu architecture diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 534be0abe246ac70950d85ad05441825c8ca768a..957c20bcf603f2f264b4658f63ac0eec438f12b1 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -187,7 +187,13 @@ function(cc_library TARGET_NAME) endif() # cpplint code style - add_style_check_target(${TARGET_NAME} ${cc_library_SRCS}) + foreach(source_file ${cc_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS}) else(cc_library_SRCS) if (cc_library_DEPS) @@ -239,6 +245,14 @@ function(nv_library TARGET_NAME) add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) target_link_libraries(${TARGET_NAME} ${nv_library_DEPS}) endif() + # cpplint code style + foreach(source_file ${nv_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) 
+ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS}) else(nv_library_SRCS) if (nv_library_DEPS) merge_static_libs(${TARGET_NAME} ${nv_library_DEPS}) @@ -389,3 +403,16 @@ function(py_proto_compile TARGET_NAME) protobuf_generate_python(py_srcs ${py_proto_compile_SRCS}) add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs}) endfunction() + +function(py_test TARGET_NAME) + if(WITH_TESTING) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_test(NAME ${TARGET_NAME} + COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR} + python2 ${py_test_SRCS} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() +endfunction() diff --git a/cmake/util.cmake b/cmake/util.cmake index 87ad9d91d8701c56255c1e7f224764998df634a7..4a27623b7ffc0b389680baee52db440c78442f46 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -118,7 +118,6 @@ endfunction() macro(add_unittest_without_exec TARGET_NAME) add_executable(${TARGET_NAME} ${ARGN}) link_paddle_test(${TARGET_NAME}) - add_style_check_target(${TARGET_NAME} ${ARGN}) endmacro() # add_unittest @@ -150,9 +149,12 @@ endfunction() # Create a python unittest using run_python_tests.sh, # which takes care of making correct running environment function(add_python_test TEST_NAME) - add_test(NAME ${TEST_NAME} - COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR} - bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh - ${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + foreach(arg ${ARGN}) + get_filename_component(py_fn ${arg} NAME_WE) + set(TRG_NAME ${TEST_NAME}_${py_fn}) + add_test(NAME ${TRG_NAME} + COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR} + python2 
${arg} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endforeach() endfunction() diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index ec7f1446cfb74842af7d0c7152bebf58619f3861..cb330ea5e1b914587a725c9b90a33053f3fbbc3d 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -104,6 +104,11 @@ cross_channel_norm ------------------ .. autoclass:: paddle.v2.layer.cross_channel_norm :noindex: + +row_l2_norm +----------- +.. autoclass:: paddle.v2.layer.row_l2_norm + :noindex: Recurrent Layers ================ @@ -252,6 +257,16 @@ seq_concat .. autoclass:: paddle.v2.layer.seq_concat :noindex: +kmax_sequence_score +------------------- +.. autoclass:: paddle.v2.layer.kmax_sequence_score + :noindex: + +sub_nested_seq +-------------- +.. autoclass:: paddle.v2.layer.sub_nested_seq + :noindex: + Reshaping Layers ================ @@ -320,6 +335,11 @@ scaling .. autoclass:: paddle.v2.layer.scaling :noindex: +clip +---- +.. autoclass:: paddle.v2.layer.clip + :noindex: + slope_intercept --------------- .. 
autoclass:: paddle.v2.layer.slope_intercept diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..e956994431fbb43438c56dcd96ad8313cf516090 --- /dev/null +++ b/doc/design/mkldnn/README.MD @@ -0,0 +1,110 @@ +# Intel® MKL-DNN on PaddlePaddle: Design Doc + +我们计划将Intel深度神经网络数学库(**MKL-DNN**\[[1](#references)\])集成到PaddlePaddle,充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 + +我们短期内的基本目标是: + +- 完成常用layer的MKL-DNN实现。 +- 完成常见深度神经网络VGG,GoogLeNet 和 ResNet的MKL-DNN实现。 + + +## Contents + +- [Overview](#overview) +- [Actions](#actions) + - [CMake](#cmake) + - [Layers](#layers) + - [Activations](#activations) + - [Unit Tests](#unit-tests) + - [Protobuf Messages](#protobuf-messages) + - [Python API](#python-api) + - [Demos](#demos) + - [Benchmarking](#benchmarking) + - [Others](#others) +- [Design Concerns](#design-concerns) + +## Overview + +我们会把MKL-DNN作为第三方库集成进PaddlePaddle,整体框架图 +
+
+Figure 1. PaddlePaddle on IA. +
+ +## Actions +我们把集成方案大致分为了如下几个方面。 + +### CMake +我们会在`CMakeLists.txt`中添加`WITH_MKLDNN`的选项,当设置这个值为`ON`的时候会启用编译MKL-DNN功能。同时会自动开启OpenMP用于提高MKL-DNN的性能。 + +同时,我们会引入`WITH_MKLML`选项,用于选择是否使用MKL-DNN自带的MKLML安装包。这个安装包可以独立于MKL-DNN使用,但是建议在开启MKL-DNN的同时也打开MKLML的开关,这样才能发挥最好的性能。 + +所以,我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中。 + +**备注**:当`WITH_MKLML=ON`的时候,会优先使用这个包作为PaddlePaddle的CBLAS和LAPACK库,所以会稍微改动`cmake/cblas.cmake`中的逻辑。 + +### Layers +所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在 +`paddle/gserver/layers`中,并且文件名都会以*Mkldnn*开头。 + +所有MKL-DNN的layers都会继承于一个叫做`MkldnnLayer`的父类,该父类继承于PaddlePaddle的基类`Layer`。 + +### Activations +由于在PaddlePaddle中,激活函数是独立于layer概念的,所以会在`paddle/gserver/activations`目录下添加一个`MkldnnActivation.h`文件定义一些用于MKL-DNN的接口,实现方法还是会在`ActivationFunction.cpp`文件。 + +### Unit Tests +会在`paddle/gserver/test`目录下添加`test_Mkldnn.cpp`和`MkldnnTester.*`用于MKL-DNN的测试。 + +Activation的测试,计划在PaddlePaddle原有的测试文件上直接添加新的测试type。 + +### Protobuf Messages +根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。 + +### Python API +目前只考虑**v1 API**。 + +计划在`python/paddle/trainer/config_parser.py`里面添加`use_mkldnn`这个选择,方便用户选择使用MKL-DNN的layers。 + +具体实现方式比如: + +```python +use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0))) +if use_mkldnn: + self.layer_type = mkldnn_* +``` + +所有MKL-DNN的layer type会以*mkldnn_*开头,以示区分。 + +并且可能在`python/paddle/trainer_config_helper`目录下的`activations.py`和`layers.py`里面添加必要的MKL-DNN的接口。 + +### Demos + +会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹,里面放入一些用于MKL-DNN测试的demo脚本。 + +### Benchmarking +会考虑添加部分逻辑在`benchmark/paddle/image/run.sh`,添加使用MKL-DNN的测试。 + +### Others +1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为64。 +2. 深入PaddlePaddle,寻找有没有其他可以优化的可能,进一步优化。比如可能会用OpenMP改进SGD的更新性能。 + +## Design Concerns + +为了更好的符合PaddlePaddle的代码风格\[[2](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[3](#references)\]。 + +我们总结出一些特别需要注意的点: + +1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数,我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MkldnnLayer`特有的设备ID。 +2. 
重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。 +3. 创建`MkldnnMatrix`,用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。 +4. 创建`MkldnnBase`,定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MkldnnStream`和`CpuEngine`,和未来可能还会用到`FPGAEngine`等。 +5. 在**Argument**里添加两个`MkldnnMatrixPtr`,取名为`mkldnnValue`和`mkldnnGrad`,用于存放`MkldnnLayer`会用到的memory buffer。 并且添加函数cvt(会修改为一个更加合适的函数名),用于处理"CPU device"和"MKL-DNN device"之间memory的相互转化。 +6. 在父类`Layer`中的`getOutput`函数中添加一段逻辑,用于判断`deviceId`,并针对device在MKL-DNN和CPU之间不统一的情况,做一个前期转换。 也就是调用`Argument`的cvt函数把output统一到需要的device上。 +7. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 + +## References + +1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN") +2. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。 +3. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的CUDNN部分使用的也是`NCHW`,所以不存在这个问题),所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。 + diff --git a/doc/design/mkldnn/image/overview.png b/doc/design/mkldnn/image/overview.png new file mode 100644 index 0000000000000000000000000000000000000000..84b455c28230703599a2529f014cfbb222138fef Binary files /dev/null and b/doc/design/mkldnn/image/overview.png differ diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md index 3692a5248a355cfcfd1cfd0911d43d65166921b1..0c10e782808ca6456347ec54cb5e921162731ede 100644 --- a/doc/design/releasing_process.md +++ b/doc/design/releasing_process.md @@ -11,6 +11,15 @@ Paddle每次发新的版本,遵循以下流程: * 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。 * 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性 * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,返回第二步 + * 编译这个版本的python wheel包,并发布到pypi。 + * 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。 + * 
pypi上的package名称为paddlepaddle和paddlepaddle_gpu,如果要上传GPU版本的包,需要修改build/python/setup.py中,name: "paddlepaddle_gpu"并重新打包wheel包:`python setup.py bdist_wheel`。 + * 上传方法: + ``` + cd build/python + pip install twine + twine upload dist/[package to upload] + ``` 4. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 5. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面 6. 协同完成Release Note的书写 diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst index 87c286a1af75e08313813f1373ea03b85d4af523..02b96bb413156786db6dc77696c5640b97c10aa4 100644 --- a/doc/getstarted/build_and_install/docker_install_cn.rst +++ b/doc/getstarted/build_and_install/docker_install_cn.rst @@ -3,6 +3,43 @@ PaddlePaddle的Docker容器使用方式 PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 `_ 才能充分利用Mac OS X和Windows上的硬件资源。 +Docker使用入门 +------------------------------ + +几个基础的概念帮助理解和使用Docker: + +- *镜像*:一个Docker镜像是一个打包好的软件。它包含了这个软件本身和它所依赖的运行环境。PaddlePaddle的Docker镜像就包含了PaddlePaddle的Python库以及其依赖的多个Python库。这样我们可以直接在Docker中运行需要的程序而不需要安装后再执行。可以执行: + + .. code-block:: bash + + docker images + + 来列出当前系统中的所有镜像,同样可以执行: + + .. code-block:: bash + + docker pull paddlepaddle/paddle:0.10.0 + + 来下载Docker镜像,paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的,推荐国内用户使用docker.paddlepaddle.org/paddle下载。 + +- *容器*: 如果说一个Docker镜像就是一个程序,那容器就是这个程序运行时产生的“进程”。 + 实际上,一个容器就是一个操作系统的进程,但是是运行在独立的进程空间,文件系统以及网络之上。 + 可以执行: + + .. code-block:: bash + + docker run paddlepaddle/paddle:0.10.0 + + 来使用一个镜像启动一个容器。 + +- 默认情况下,Docker容器会运行在独立的文件系统空间之上,我们无法在Docker容器中 + 访问到主机上的文件。可以通过*挂载Volume*的方式,将主机上的文件或目录挂载到 + Docker容器中。下面的命令把当前目录挂载到了容器中的 /data 目录下,容器使用 + debian镜像,并且启动后执行 :code:`ls /data`。 + + .. 
code-block:: bash + + docker run --rm -v $(pwd):/data debian ls /data PaddlePaddle发布的Docker镜像使用说明 ------------------------------ @@ -12,11 +49,11 @@ PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打 像,称为生产镜像,里面涵盖了PaddlePaddle运行所需的所有环境。每次 PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运 行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在 -`dockerhub.com `_ 提供最新 -的Docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。为了方便在国 -内的开发者下载Docker镜像,我们提供了国内的镜像服务器供大家使用。如果您 -在国内,请把文档里命令中的paddlepaddle/paddle替换成 -docker.paddlepaddle.org/paddle。 +`dockerhub.com `_ +和国内镜像`docker.paddlepaddle.org` 提供最新 +的Docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。 + +**注意:为了方便在国内的开发者下载Docker镜像,我们提供了国内的镜像服务器供大家使用。如果您在国内,请把文档里命令中的paddlepaddle/paddle替换成docker.paddlepaddle.org/paddle。** 1. 开发镜像::code:`paddlepaddle/paddle:0.10.0-dev` @@ -68,6 +105,8 @@ docker.paddlepaddle.org/paddle。 如果输出是No,就需要选择使用no-AVX的镜像 + **注:在0.10.0之后的版本,PaddlePaddle都可以自动判断硬件是否支持AVX,所以无需判断AVX即可使用** + 以上方法在GPU镜像里也能用,只是请不要忘记提前在物理机上安装GPU最新驱动。 为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。 diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index b6fd3329b273aabe80edd5f1ff064a311648b3c2..94860240f6a4a9bed8a865684a8a79960489280e 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -63,12 +63,35 @@ CPU-only version and a CUDA GPU version and their no-AVX versions. We put the docker images on `dockerhub.com `_. You can find the -latest versions under "tags" tab at dockerhub.com. If you are in -China, you can use our Docker image registry mirror to speed up the -download process. To use it, please replace all paddlepaddle/paddle in -the commands to docker.paddlepaddle.org/paddle. +latest versions under "tags" tab at dockerhub.com. -1. Production images, this image might have multiple variants: +** NOTE: If you are in China, you can use our Docker image registry mirror to speed up the download process. 
To use it, please replace all paddlepaddle/paddle in the commands to docker.paddlepaddle.org/paddle.** + + +1. development image :code:`paddlepaddle/paddle:-dev` + + This image has packed related develop tools and runtime + environment. Users and developers can use this image instead of + their own local computer to accomplish development, build, + releasing, document writing etc. While different versions of paddle + may depend on different versions of libraries and tools, if you + want to setup a local environment, you must pay attention to the + versions. The development image contains: + + - gcc/clang + - nvcc + - Python + - sphinx + - woboq + - sshd + + Many developers use servers with GPUs, they can use ssh to login to + the server and run :code:`docker exec` to enter the docker + container and start their work. Also they can start a development + docker image with SSHD service, so they can login to the container + and start work. + +2. Production images, this image might have multiple variants: - GPU/AVX::code:`paddlepaddle/paddle:-gpu` - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` @@ -84,7 +107,7 @@ the commands to docker.paddlepaddle.org/paddle. if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - + **NOTE: versions after 0.10.0 automatically detect system AVX support, so manual detection is not needed in this case.** To run the CPU-only image as an interactive container: .. code-block:: bash @@ -103,29 +126,6 @@ the commands to docker.paddlepaddle.org/paddle. nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash -2. development image :code:`paddlepaddle/paddle:-dev` - - This image has packed related develop tools and runtime - environment. Users and developers can use this image instead of - their own local computer to accomplish development, build, - releasing, document writing etc. 
While different version of paddle - may depends on different version of libraries and tools, if you - want to setup a local environment, you must pay attention to the - versions. The development image contains: - - - gcc/clang - - nvcc - - Python - - sphinx - - woboq - - sshd - - Many developers use servers with GPUs, they can use ssh to login to - the server and run :code:`docker exec` to enter the docker - container and start their work. Also they can start a development - docker image with SSHD service, so they can login to the container - and start work. - Train Model Using Python API ---------------------------- diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in index 95cad835b11816f4d2e256c2abd662a545a5bad2..673948dfe7928240817b552141ec9bc2f8a672b7 100644 --- a/doc/templates/conf.py.cn.in +++ b/doc/templates/conf.py.cn.in @@ -13,15 +13,11 @@ # serve to show the default. import sys import os, subprocess +sys.path.insert(0, os.path.abspath('@PROJ_ROOT@/python')) import shlex from recommonmark import parser, transform -try: - import py_paddle - import paddle - import paddle.v2 -except ImportError: - print("Must install paddle python package before generating documentation") - sys.exit(1) +import paddle +import paddle.v2 MarkdownParser = parser.CommonMarkParser AutoStructify = transform.AutoStructify diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in index b477f0120c4fa0544012080b7cfb8572d3c44b04..b6b50b7dcd5647b50a13703160489323ed90a1b4 100644 --- a/doc/templates/conf.py.en.in +++ b/doc/templates/conf.py.en.in @@ -13,15 +13,11 @@ # serve to show the default. 
import sys import os, subprocess +sys.path.insert(0, os.path.abspath('@PROJ_ROOT@/python')) import shlex from recommonmark import parser, transform -try: - import py_paddle - import paddle - import paddle.v2 -except ImportError: - print("Must install paddle python package before generating documentation") - sys.exit(1) +import paddle +import paddle.v2 MarkdownParser = parser.CommonMarkParser diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index f9cd8f87e8f2e715c87834ee08482be0f511f681..bec5775d540729000ab2dd3002600f0a92619d70 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -32,7 +32,7 @@ import ( func main() { port := flag.Int("port", 0, "port of the pserver") - index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0") + index := flag.Int("index", -1, "index of the pserver, set to -1 if use etcd for auto pserver index registry") etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379", "comma separated endpoint string for pserver to connect to etcd") dialTimeout := flag.Duration("dial-timeout", 5*time.Second, "dial timeout") @@ -60,12 +60,12 @@ func main() { idx, err = e.Register(*port) candy.Must(err) - cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e) + cp, err = pserver.LoadCheckpoint(e, idx) if err != nil { if err == pserver.ErrCheckpointNotFound { log.Infof("Could not find the pserver checkpoint.") } else { - log.Errorf("Fetch checkpoint failed, %s", err) + panic(err) } } } diff --git a/go/glide.lock b/go/glide.lock index 1f16abdf66422abcd0ab7987cab3499d02cf1b9c..be1fb24d772a6524cb798c6169c23ff03e9fed7b 100644 --- a/go/glide.lock +++ b/go/glide.lock @@ -1,5 +1,5 @@ -hash: 2a1c0eca5c07a130e3d224f9821f96cfa37a39bf6bce141c855bbc57ef569f1c -updated: 2017-07-29T07:34:48.722757905+08:00 +hash: 1b9b07408ca7fac27a374dc2ccd2433e4bff090484008a037df967284949a582 +updated: 2017-08-03T21:46:51.744995189Z imports: - name: github.com/beorn7/perks version: 
4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9 @@ -145,6 +145,8 @@ imports: version: a1dba9ce8baed984a2495b658c82687f8157b98f subpackages: - xfs +- name: github.com/satori/go.uuid + version: 879c5887cd475cd7864858769793b2ceb0d44feb - name: github.com/sirupsen/logrus version: a3f95b5c423586578a4e099b11a46c2479628cac - name: github.com/topicai/candy diff --git a/go/glide.yaml b/go/glide.yaml index bc23fa6ebf2c3db61e2d63e5f7e7ddcb595dfed0..a90e71b615de92d64c79823e2a04c46001963932 100644 --- a/go/glide.yaml +++ b/go/glide.yaml @@ -14,11 +14,13 @@ import: version: ^1.0.0 - package: github.com/topicai/candy - package: golang.org/x/crypto - vcs: git repo: https://github.com/golang/crypto.git -- package: golang.org/x/sys vcs: git +- package: golang.org/x/sys repo: https://github.com/golang/sys.git -- package: golang.org/x/text vcs: git +- package: golang.org/x/text repo: https://github.com/golang/text.git + vcs: git +- package: github.com/satori/go.uuid + version: v1.1.0 diff --git a/go/master/service.go b/go/master/service.go index d30e9a33229c0aff354417771b5bf2ae6a781715..df7c6860e6ae13a5be7d0425273812208685ee9d 100644 --- a/go/master/service.go +++ b/go/master/service.go @@ -77,11 +77,12 @@ type taskEntry struct { NumFailure int } -type taskQueues struct { +type masterState struct { Todo []taskEntry Pending map[int]taskEntry // map from task ID to task entry Done []taskEntry Failed []taskEntry + CurPass int } // Service is the master server service. @@ -94,11 +95,11 @@ type Service struct { ready chan struct{} initDone bool - mu sync.Mutex - taskQueues taskQueues - currPass int - jobTasks []taskEntry - + mu sync.Mutex + // State to be persisted to snapshot. + state masterState + // The trainer that is currently saving model. This state is + // transient, does not need to be persisted to snapshot. 
savingTrainer string } @@ -141,8 +142,8 @@ func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failur s.chunksPerTask = chunksPerTask s.timeoutDur = timeoutDur s.failureMax = failureMax - s.taskQueues = taskQueues{} - s.taskQueues.Pending = make(map[int]taskEntry) + s.state = masterState{} + s.state.Pending = make(map[int]taskEntry) s.ready = make(chan struct{}) s.store = store recovered, err := s.recover() @@ -180,7 +181,7 @@ func (s *Service) recover() (bool, error) { } dec := gob.NewDecoder(gr) - var tqs taskQueues + var tqs masterState err = dec.Decode(&tqs) if err != nil { return false, err @@ -193,7 +194,12 @@ func (s *Service) recover() (bool, error) { log.Errorln(err) } - s.taskQueues = tqs + s.state = tqs + log.WithFields(s.logFields()).Infof("Master recovered from snapshot, scheduling pending task timeout check.") + for _, t := range s.state.Pending { + time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch)) + } + return true, nil } @@ -208,7 +214,7 @@ func (s *Service) snapshot() error { var buf bytes.Buffer gw := gzip.NewWriter(&buf) enc := gob.NewEncoder(gw) - err := enc.Encode(s.taskQueues) + err := enc.Encode(s.state) if err != nil { return err } @@ -290,8 +296,7 @@ func (s *Service) SetDataset(globPaths []string, _ *int) error { return err } - s.jobTasks = partition(chunks, s.chunksPerTask) - s.taskQueues.Todo = s.jobTasks + s.state.Todo = partition(chunks, s.chunksPerTask) err = s.snapshot() if err != nil { @@ -319,17 +324,17 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) { } }() - delete(s.taskQueues.Pending, t.Task.Meta.ID) + delete(s.state.Pending, t.Task.Meta.ID) t.NumFailure++ if t.NumFailure > s.failureMax { log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure) - s.taskQueues.Failed = append(s.taskQueues.Failed, t) + s.state.Failed = append(s.state.Failed, t) return } log.Warningf("Task %v failed %d times, re-dispatch.", t.Task, t.NumFailure) - 
s.taskQueues.Todo = append(s.taskQueues.Todo, t) + s.state.Todo = append(s.state.Todo, t) return } @@ -338,7 +343,7 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() { s.mu.Lock() defer s.mu.Unlock() - t, ok := s.taskQueues.Pending[taskID] + t, ok := s.state.Pending[taskID] if !ok { return } @@ -350,10 +355,11 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() { // must be called with lock held. func (s *Service) logFields() log.Fields { return log.Fields{ - "todoLen": len(s.taskQueues.Todo), - "pendingLen": len(s.taskQueues.Pending), - "doneLen": len(s.taskQueues.Done), - "failedLen": len(s.taskQueues.Failed), + "todoLen": len(s.state.Todo), + "pendingLen": len(s.state.Pending), + "doneLen": len(s.state.Done), + "failedLen": len(s.state.Failed), + "curPass": s.state.CurPass, } } @@ -366,17 +372,17 @@ func (s *Service) GetTask(passID int, task *Task) error { s.mu.Lock() defer s.mu.Unlock() - if passID < s.currPass { + if passID < s.state.CurPass { return ErrPassBefore } - if passID > s.currPass { + if passID > s.state.CurPass { // Client may get run to pass after master when one client faster than the // other return ErrPassAfter } - if len(s.taskQueues.Todo) == 0 { - if len(s.taskQueues.Done) == 0 && len(s.taskQueues.Pending) == 0 { + if len(s.state.Todo) == 0 { + if len(s.state.Done) == 0 && len(s.state.Pending) == 0 { log.WithFields(s.logFields()).Warningln("All tasks failed, may start next pass") return ErrAllTaskFailed } @@ -384,10 +390,10 @@ func (s *Service) GetTask(passID int, task *Task) error { return ErrNoMoreAvailable } - t := s.taskQueues.Todo[0] + t := s.state.Todo[0] t.Task.Meta.Epoch++ - s.taskQueues.Todo = s.taskQueues.Todo[1:] - s.taskQueues.Pending[t.Task.Meta.ID] = t + s.state.Todo = s.state.Todo[1:] + s.state.Pending[t.Task.Meta.ID] = t err := s.snapshot() if err != nil { return err @@ -409,7 +415,7 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error { s.mu.Lock() defer s.mu.Unlock() - t, ok := 
s.taskQueues.Pending[taskID] + t, ok := s.state.Pending[taskID] if !ok { log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID) return nil @@ -417,18 +423,18 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error { // task finished, reset timeout t.NumFailure = 0 - s.taskQueues.Done = append(s.taskQueues.Done, t) - delete(s.taskQueues.Pending, taskID) + s.state.Done = append(s.state.Done, t) + delete(s.state.Pending, taskID) log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID) - if len(s.taskQueues.Todo) == 0 && len(s.taskQueues.Pending) == 0 { + if len(s.state.Todo) == 0 && len(s.state.Pending) == 0 { // increase master side pass count if all tasks finished - s.currPass++ - s.taskQueues.Todo = s.jobTasks - s.taskQueues.Done = []taskEntry{} + s.state.CurPass++ + s.state.Todo = append(s.state.Done, s.state.Failed...) + s.state.Done = []taskEntry{} // TODO(typhoonzero): deal with failed tasks - s.taskQueues.Failed = []taskEntry{} - log.WithFields(s.logFields()).Warningf("all task finished, add new pass data, newpass: %d.", s.currPass) + s.state.Failed = []taskEntry{} + log.WithFields(s.logFields()).Warningf("all task finished, add new pass data, newpass: %d.", s.state.CurPass) } err := s.snapshot() @@ -447,7 +453,7 @@ func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error { s.mu.Lock() defer s.mu.Unlock() - t, ok := s.taskQueues.Pending[meta.ID] + t, ok := s.state.Pending[meta.ID] if !ok { log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%v not found.", t.Task.Meta) return nil diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go index b630d434dca283df67f5b850b35057870fe27529..1243ebd6836550d58144b5033e2755ae8594e948 100644 --- a/go/pserver/client/client_test.go +++ b/go/pserver/client/client_test.go @@ -59,7 +59,7 @@ func initClient() [numPserver]int { go func(l net.Listener) { var cp pserver.Checkpoint - s, err := pserver.NewService(0, 1, "", nil, cp) + s, err := 
pserver.NewService(0, time.Hour, "", nil, cp) if err != nil { panic(err) } diff --git a/go/pserver/client/etcd_client.go b/go/pserver/client/etcd_client.go index b6ff1fec8a6f37f61f38cb5d004b1d2c886473ed..977ae5af37e2b7d647ae16af9c4403f916b0216d 100644 --- a/go/pserver/client/etcd_client.go +++ b/go/pserver/client/etcd_client.go @@ -103,7 +103,7 @@ func (p *EtcdClient) List() []Server { time.Sleep(p.timeout) continue } - log.Infof("got value (%s) for key: %s", psAddr, psKey) + log.Debugf("got value (%s) for key: %s", psAddr, psKey) servers[i].Index = i servers[i].Addr = psAddr } diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go index 4fb26307667295ab825d07be6c3d1d4b33f6eb8b..41f0640fc09a3265c0e11c06255c7ee834983203 100644 --- a/go/pserver/etcd_client.go +++ b/go/pserver/etcd_client.go @@ -206,6 +206,7 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { if err != nil { return []byte{}, err } + kvs := resp.Kvs if len(kvs) == 0 { return []byte{}, nil @@ -215,9 +216,14 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { } // PutKey put into etcd with value by key specified -func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) error { +func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration, withLease bool) error { ctx, cancel := context.WithTimeout(context.Background(), timeout) - _, err := e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease())) + var err error + if withLease { + _, err = e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease())) + } else { + _, err = e.client.Put(ctx, key, string(value)) + } cancel() return err } diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index 709160d45d98b6cf6d60f52ceb3fb33e0a0bd17d..ae7359073494bd9cb6b70b12af4daca064179556 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -32,6 +32,7 @@ type optimizer struct { opt 
*C.struct_paddle_optimizer elementType ElementType contentLen int + config []byte } func cArrayToSlice(p unsafe.Pointer, len int) []byte { @@ -70,6 +71,7 @@ func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer cstate = unsafe.Pointer(&s[0]) } + o.config = c o.opt = C.paddle_create_optimizer((*C.uchar)(&c[0]), C.int(len(c)), C.paddle_element_type(p.ElementType), cbuffer, C.int(paramBufferSize), (*C.char)(cstate), C.int(len(s))) return o diff --git a/go/pserver/service.go b/go/pserver/service.go index 7d297c46d03bf78d18ca9830a318968397119d3e..25751540a9a2dff043c14e0912bfab1aaa938ab4 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -25,11 +25,13 @@ import ( "fmt" "io/ioutil" "os" - "path/filepath" + "path" "strconv" "sync" "time" + uuid "github.com/satori/go.uuid" + log "github.com/sirupsen/logrus" ) @@ -42,9 +44,9 @@ var ErrCheckpointNotFound = errors.New("checkpoint not found") // RPC error message. const ( - AlreadyInitialized = "pserver already initialized" - Uninitialized = "pserver not fully initialized" - CheckpointMD5Failed = "checkpoint file MD5 validation failed" + AlreadyInitialized = "pserver already initialized" + Uninitialized = "pserver not fully initialized" + WrongChecksum = "checkpoint file checksum validation failed" ) // Supported element types. @@ -73,11 +75,12 @@ type ParameterWithConfig struct { // checkpointMeta saves checkpoint metadata type checkpointMeta struct { UUID string `json:"uuid"` + Path string `json:"path"` MD5 string `json:"md5"` Timestamp int64 `json:"timestamp"` } -// Checkpoint is the pserver shard persist in file +// Checkpoint is the pserver shard persist in file. type Checkpoint []parameterCheckpoint // Gradient is the gradient of the parameter. 
@@ -90,50 +93,58 @@ type Service struct { checkpointInterval time.Duration checkpointPath string client *EtcdClient - mu sync.Mutex - optMap map[string]*optimizer + + mu sync.Mutex + optMap map[string]*optimizer } -// parameterCheckpoint saves parameter checkpoint +// parameterCheckpoint saves parameter checkpoint. type parameterCheckpoint struct { ParameterWithConfig State []byte } -// NewCheckpointFromFile loads parameters and state from checkpoint file -func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (Checkpoint, error) { - v, err := e.GetKey(PsPath+string(idx), 3*time.Second) +func loadMeta(e *EtcdClient, idx int) (meta checkpointMeta, err error) { + v, err := e.GetKey(PsCheckpoint+strconv.Itoa(idx), 3*time.Second) if err != nil { - return nil, err + return } if len(v) == 0 { - return nil, ErrCheckpointNotFound + err = ErrCheckpointNotFound + return } - var cpMeta checkpointMeta - if err = json.Unmarshal(v, &cpMeta); err != nil { - return nil, err + if err = json.Unmarshal(v, &meta); err != nil { + return } - fn := filepath.Join(cpPath, cpMeta.UUID) - if _, err = os.Stat(fn); os.IsNotExist(err) { + return +} + +// LoadCheckpoint loads checkpoint from file. +func LoadCheckpoint(e *EtcdClient, idx int) (Checkpoint, error) { + cpMeta, err := loadMeta(e, idx) + if err != nil { return nil, err } - content, err := ioutil.ReadFile(fn) + + content, err := ioutil.ReadFile(cpMeta.Path) if err != nil { return nil, err } + // TODO(helin): change MD5 to CRC since CRC is better for file + // checksum in our use case (emphasize speed over security). 
h := md5.New() md5 := hex.EncodeToString(h.Sum(content)) if md5 != cpMeta.MD5 { - return nil, errors.New(CheckpointMD5Failed) + return nil, errors.New(WrongChecksum) } dec := gob.NewDecoder(bytes.NewReader(content)) - cp := Checkpoint{} - if err = dec.Decode(cp); err != nil { + var cp Checkpoint + if err = dec.Decode(&cp); err != nil { return nil, err } return cp, nil @@ -193,6 +204,15 @@ func (s *Service) FinishInitParams(_ int, _ *int) error { } close(s.initialized) + go func() { + t := time.Tick(s.checkpointInterval) + for range t { + err := s.checkpoint() + if err != nil { + log.Errorln(err) + } + } + }() return nil } @@ -240,23 +260,36 @@ func (s *Service) GetParam(name string, parameter *Parameter) error { return nil } -// pserver save checkpoint -func (s *Service) doCheckpoint() (err error) { - <-s.initialized - s.mu.Lock() - defer s.mu.Unlock() +func traceTime(start time.Time, name string) { + elapsed := time.Since(start) + log.Infof("%s took %v", name, elapsed) +} + +// checkpoint saves checkpoint to disk. +// +// checkpoint should be only called after the parameters are +// initialized. +func (s *Service) checkpoint() (err error) { + log.Infoln("Begin save checkpoint.") + defer traceTime(time.Now(), "save checkpoint") + s.mu.Lock() cp := make([]parameterCheckpoint, len(s.optMap)) index := 0 + // TODO(helin): write checkpoint incrementally to reduce memory + // footprint during checkpoint. 
for name, opt := range s.optMap { var pc parameterCheckpoint pc.Param.Name = name pc.Param.ElementType = opt.elementType pc.Param.Content = opt.GetWeights() + pc.Config = opt.config pc.State = opt.GetStates() cp[index] = pc index++ } + s.mu.Unlock() + var buf bytes.Buffer encoder := gob.NewEncoder(&buf) err = encoder.Encode(cp) @@ -264,32 +297,9 @@ func (s *Service) doCheckpoint() (err error) { return } - cpMeta := checkpointMeta{} - cpMeta.UUID = s.checkpointPath + strconv.Itoa(s.idx) - cpMeta.Timestamp = time.Now().UnixNano() - h := md5.New() - cpMeta.MD5 = hex.EncodeToString(h.Sum(buf.Bytes())) - - cpMetajson, err := json.Marshal(cpMeta) - if err != nil { - return - } - - err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3*time.Second) - if err != nil { - return - } - if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) { - log.Info("checkpoint does not exists.") - } else { - err = os.Remove(cpMeta.UUID) - if err != nil { - log.Infof("Removing checkpoint %s failed", cpMeta.UUID) - } else { - log.Infof("checkpoint %s already exsits, removing ", cpMeta.UUID) - } - } - f, err := os.Create(cpMeta.UUID) + id := uuid.NewV4().String() + p := path.Join(s.checkpointPath, id) + f, err := os.Create(p) if err != nil { return } @@ -317,5 +327,43 @@ func (s *Service) doCheckpoint() (err error) { return } + oldMeta, err := loadMeta(s.client, s.idx) + if err == ErrCheckpointNotFound { + log.Infoln("Do not have existing checkpoint.") + err = nil + } + + if err != nil { + return + } + + h := md5.New() + md5 := hex.EncodeToString(h.Sum(buf.Bytes())) + cpMeta := checkpointMeta{ + UUID: id, + Timestamp: time.Now().UnixNano(), + MD5: md5, + Path: p, + } + + json, err := json.Marshal(cpMeta) + if err != nil { + return + } + + err = s.client.PutKey(PsCheckpoint+strconv.Itoa(s.idx), json, 3*time.Second, false) + if err != nil { + return + } + + if oldMeta.Path != "" { + rmErr := os.Remove(oldMeta.Path) + if rmErr != nil { + // log error, but still treat 
checkpoint as + // successful. + log.Errorln(rmErr) + } + } + return } diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go index 988f3b5acb82a95aeb54af2b8b0e4d39a458291a..be648cd1e83e4f7790edac5842db432fb4870072 100644 --- a/go/pserver/service_test.go +++ b/go/pserver/service_test.go @@ -30,7 +30,7 @@ const ( func TestServiceFull(t *testing.T) { var cp pserver.Checkpoint - s, err := pserver.NewService(0, 1, "", nil, cp) + s, err := pserver.NewService(0, time.Hour, "", nil, cp) if err != nil { t.Error(err) } @@ -102,7 +102,7 @@ func TestServiceFull(t *testing.T) { func TestMultipleInit(t *testing.T) { var cp pserver.Checkpoint - s, err := pserver.NewService(0, 1, "", nil, cp) + s, err := pserver.NewService(0, time.Hour, "", nil, cp) if err != nil { t.Fatal(err) } @@ -119,7 +119,7 @@ func TestMultipleInit(t *testing.T) { func TestUninitialized(t *testing.T) { var cp pserver.Checkpoint - s, err := pserver.NewService(0, 1, "", nil, cp) + s, err := pserver.NewService(0, time.Hour, "", nil, cp) err = s.SendGrad(pserver.Gradient{}, nil) if err.Error() != pserver.Uninitialized { t.Fatal(err) @@ -128,7 +128,7 @@ func TestUninitialized(t *testing.T) { func TestBlockUntilInitialized(t *testing.T) { var cp pserver.Checkpoint - s, err := pserver.NewService(0, 1, "", nil, cp) + s, err := pserver.NewService(0, time.Hour, "", nil, cp) if err != nil { t.Error(err) } diff --git a/paddle/.set_python_path.sh b/paddle/.set_python_path.sh index fa7baccc86e0b56e57d52a40c95cfe1b98fececc..8fd58925ee4820269572176ff9496f42914652da 100755 --- a/paddle/.set_python_path.sh +++ b/paddle/.set_python_path.sh @@ -21,22 +21,15 @@ # # It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...} # - -if ! 
python -c "import paddle" >/dev/null 2>/dev/null; then - PYPATH="" - set -x - while getopts "d:" opt; do - case $opt in - d) - PYPATH=$OPTARG - ;; - esac - done - shift $(($OPTIND - 1)) - export PYTHONPATH=$PYPATH:$PYTHONPATH - $@ -else - echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment." - echo "Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'" - exit 1 -fi +PYPATH="" +set -x +while getopts "d:" opt; do + case $opt in + d) + PYPATH=$OPTARG + ;; + esac +done +shift $(($OPTIND - 1)) +export PYTHONPATH=$PYPATH:$PYTHONPATH +$@ diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 4b06966fba2bc9f92756be0cb8110bbcd5272423..cf61a243e9df2fd4a580e41f07cb0a22dcc72083 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -15,7 +15,6 @@ if(Boost_FOUND) add_subdirectory(platform) add_subdirectory(framework) add_subdirectory(operators) - add_subdirectory(pybind) endif() if(WITH_C_API) @@ -23,7 +22,5 @@ if(WITH_C_API) endif() if(WITH_SWIG_PY) - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in - ${CMAKE_CURRENT_SOURCE_DIR}/setup.py) add_subdirectory(api) endif() diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index 84da89a1422b6095b995744cebb6a3af98a071c6..7a1e8b8b26ac6330c3799b7dfeb4447e171fe0f1 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -82,9 +82,7 @@ SWIG_LINK_LIBRARIES(swig_paddle add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PROJ_ROOT}/paddle/py_paddle - COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel - COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp - COMMAND rm -rf py_paddle.egg-info build + COMMAND ${CMAKE_COMMAND} -E touch .timestamp WORKING_DIRECTORY ${PROJ_ROOT}/paddle DEPENDS _swig_paddle ) @@ -92,10 +90,6 @@ 
add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so # TODO(yuyang18) : make wheel name calculated by cmake add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so) -install(DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/dist/ - DESTINATION opt/paddle/share/wheels -) - if(WITH_TESTING) IF(NOT PY_PIP_FOUND) SET(PIP_SOURCES_DIR ${PYTHON_SOURCES_DIR}/pip) @@ -108,7 +102,7 @@ if(WITH_TESTING) BUILD_COMMAND "" INSTALL_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install BUILD_IN_SOURCE 1 - DEPENDS python setuptools python_api_wheel + #DEPENDS python setuptools python_api_wheel ) ENDIF() add_subdirectory(test) diff --git a/paddle/api/test/CMakeLists.txt b/paddle/api/test/CMakeLists.txt index f3b1c2c4d438b5d3e776ef27ce8f8b78f710f2ab..761aeb5b174105edece8880a9f5012c13a63fd11 100644 --- a/paddle/api/test/CMakeLists.txt +++ b/paddle/api/test/CMakeLists.txt @@ -1,2 +1,6 @@ -add_python_test(test_swig_api - testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py testTrainer.py) +py_test(testTrain SRCS testTrain.py) +py_test(testMatrix SRCS testMatrix.py) +py_test(testVector SRCS testVector.py) +py_test(testTrainer SRCS testTrainer.py) +py_test(testArguments SRCS testArguments.py) +py_test(testGradientMachine SRCS testGradientMachine.py) diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index 73ffa690d9d91b673079fc0ecf91f17cbabfdb1e..0865b02c4f275f3d5069109917b05dff1393fc1e 100755 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -39,6 +39,7 @@ set(CUDA_CU_SOURCES src/hl_cuda_lstm.cu src/hl_top_k.cu src/hl_batch_transpose.cu + src/hl_batch_norm.cu src/hl_cuda_sequence.cu src/hl_table_apply.cu) diff --git a/paddle/cuda/include/hl_batch_norm.h b/paddle/cuda/include/hl_batch_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..afc5e0b2deacc4aadf98b3f7ce115e534bbc5124 --- /dev/null +++ b/paddle/cuda/include/hl_batch_norm.h @@ -0,0 +1,48 @@ +/* 
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef HL_BATCH_NORM_H_ +#define HL_BATCH_NORM_H_ + +#include "hl_base.h" + +/** + * @brief batch norm inferece. + * + * @param[in] input input data. + * @param[out] output output data. + * @param[in] scale batch normalization scale parameter (in original + * paper scale is referred to as gamma). + * @param[in] bias batch normalization bias parameter (in original + * paper scale is referred to as beta). + * @param[in] estimatedMean + * @param[in] estimatedVar The moving mean and variance + * accumulated during the training phase are passed + * as inputs here. + * @param[in] epsilon Epsilon value used in the batch + * normalization formula. + */ +extern void hl_batch_norm_cuda_inference(const real* input, + real* output, + const real* scale, + const real* bias, + const real* estimatedMean, + const real* estimatedVar, + const double epsilon, + size_t batchSize, + size_t channel, + size_t height, + size_t width); + +#endif // HL_BATCH_NORM_H_ diff --git a/paddle/cuda/src/hl_batch_norm.cu b/paddle/cuda/src/hl_batch_norm.cu new file mode 100644 index 0000000000000000000000000000000000000000..5828ecb8e049c2f0573ab8547164794bef6db1ca --- /dev/null +++ b/paddle/cuda/src/hl_batch_norm.cu @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "hl_batch_norm.h" + +__global__ void batchNormInference(real* output, + const real* input, + const real* scale, + const real* bias, + const real* estimatedMean, + const real* estimatedVar, + const double epsilon, + size_t batchSize, + size_t channel, + size_t height, + size_t width) { + const int tid = threadIdx.x; + const int num = channel * height * width; + const int batch = blockIdx.x; + for (int i = tid; i < num; i += blockDim.x) { + const int c = i / (height * width); + const int id = batch * num + i; + real val = input[id] - estimatedMean[c]; + val /= sqrt(estimatedVar[c] + epsilon); + val *= scale[c]; + val += bias[c]; + output[id] = val; + } +} + +void hl_batch_norm_cuda_inference(const real* input, + real* output, + const real* scale, + const real* bias, + const real* estimatedMean, + const real* estimatedVar, + const double epsilon, + size_t batchSize, + size_t channel, + size_t height, + size_t width) { + batchNormInference<<>>(output, + input, + scale, + bias, + estimatedMean, + estimatedVar, + epsilon, + batchSize, + channel, + height, + width); + + CHECK_SYNC("hl_batch_norm_cuda_inference failed!"); +} diff --git a/paddle/cuda/src/hl_batch_transpose.cu b/paddle/cuda/src/hl_batch_transpose.cu index f047403da17e66960f029f2fee7312210009c952..f4c253df7b4be937f041f18587efd4c9d693fbe4 100644 --- a/paddle/cuda/src/hl_batch_transpose.cu +++ b/paddle/cuda/src/hl_batch_transpose.cu @@ -12,17 +12,15 @@ WITHOUT WARRANTIES 
OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_batch_transpose.h" #include "hl_base.h" +#include "hl_batch_transpose.h" const int TILE_DIM = 64; const int BLOCK_ROWS = 16; // No bank-conflict transpose for a batch of data. -__global__ void batchTransposeNoBankConflicts(real* odata, - const real* idata, - int numSamples, int width, - int height) { +__global__ void batchTransposeNoBankConflicts( + real* odata, const real* idata, int numSamples, int width, int height) { __shared__ float tile[TILE_DIM][TILE_DIM + 1]; const int x = blockIdx.x * TILE_DIM + threadIdx.x; @@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata, newX] = tile[threadIdx.x][j]; } -void batchTranspose(const real* input, real* output, int width, int height, - int batchSize) { +void batchTranspose( + const real* input, real* output, int width, int height, int batchSize) { dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize); - batchTransposeNoBankConflicts<<>> - (output, input, batchSize, width, height); + batchTransposeNoBankConflicts<<>>( + output, input, batchSize, width, height); CHECK_SYNC("batchTranspose failed!"); } diff --git a/paddle/cuda/src/hl_cuda_aggregate.cu b/paddle/cuda/src/hl_cuda_aggregate.cu index 97034a917708487d1c5dc59e6ebbf45bad1c3227..16a54ad343fa140aa1f3bec311c4b712d0086082 100644 --- a/paddle/cuda/src/hl_cuda_aggregate.cu +++ b/paddle/cuda/src/hl_cuda_aggregate.cu @@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - +#include "hl_aggregate.h" #include "hl_base.h" #include "hl_cuda.h" #include "hl_cuda.ph" -#include "hl_aggregate.h" -#include "hl_thread.ph" #include "hl_matrix_base.cuh" +#include "hl_thread.ph" #include "paddle/utils/Logging.h" /** * @brief matrix row operator. */ -template -__global__ void KeMatrixRowOp(Agg agg, - real *E, - real *Sum, - int dimN) { +template +__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) { __shared__ real sum_s[blockSize]; - int cnt = (dimN + blockSize -1) / blockSize; - int rowId = blockIdx.x + blockIdx.y*gridDim.x; - int index = rowId*dimN; + int cnt = (dimN + blockSize - 1) / blockSize; + int rowId = blockIdx.x + blockIdx.y * gridDim.x; + int index = rowId * dimN; int tid = threadIdx.x; int lmt = tid; @@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg, sum_s[tid] = tmp; __syncthreads(); - for (int stride = blockSize/2; stride > 0; stride = stride/2) { + for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { if (tid < stride) { sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]); } @@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg, } template -void hl_matrix_row_op(Agg agg, - real *A_d, - real *C_d, - int dimM, - int dimN) { +void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) { int blocksX = dimM; int blocksY = 1; dim3 threads(128, 1); dim3 grid(blocksX, blocksY); - KeMatrixRowOp<<< grid, threads, 0, STREAM_DEFAULT >>> - (agg, A_d, C_d, dimN); + KeMatrixRowOp<<>>( + agg, A_d, C_d, dimN); } void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_row_op(aggregate::sum(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_row_sum failed"); } @@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_row_op(aggregate::max(), - A_d, - C_d, - dimM, - dimN); + 
hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_row_max failed"); } @@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_row_op(aggregate::min(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_row_min failed"); } /** * @brief matrix column operator. */ -template -__global__ void KeMatrixColumnOp(Agg agg, - real *E, - real *Sum, - int dimM, - int dimN) { +template +__global__ void KeMatrixColumnOp( + Agg agg, real *E, real *Sum, int dimM, int dimN) { int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; real tmp = agg.init(); if (rowIdx < dimN) { @@ -127,15 +104,12 @@ __global__ void KeMatrixColumnOp(Agg agg, } } -template -__global__ void KeMatrixColumnOp_S(Agg agg, - real *E, - real *Sum, - int dimM, - int dimN) { - __shared__ real _sum[blockDimX*blockDimY]; - int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; - int index = threadIdx.y; +template +__global__ void KeMatrixColumnOp_S( + Agg agg, real *E, real *Sum, int dimM, int dimN) { + __shared__ real _sum[blockDimX * blockDimY]; + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + int index = threadIdx.y; real tmp = agg.init(); if (rowIdx < dimN) { @@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg, index += blockDimY; } } - _sum[threadIdx.x + threadIdx.y*blockDimX] = tmp; + _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp; __syncthreads(); if (rowIdx < dimN) { - if (threadIdx.y ==0) { + if (threadIdx.y == 0) { real tmp = agg.init(); - for (int i=0; i < blockDimY; i++) { - tmp = agg(tmp, _sum[threadIdx.x + i*blockDimX]); + for (int i = 0; i < blockDimY; i++) { + tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]); } Sum[rowIdx] = tmp; } @@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg, } template -void hl_matrix_column_op(Agg agg, - real *A_d, - real *C_d, - int dimM, - int dimN) { +void 
hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) { if (dimN >= 8192) { - int blocksX = (dimN + 128 -1) / 128; + int blocksX = (dimN + 128 - 1) / 128; int blocksY = 1; dim3 threads(128, 1); dim3 grid(blocksX, blocksY); - KeMatrixColumnOp<<< grid, threads, 0, STREAM_DEFAULT >>> - (agg, A_d, C_d, dimM, dimN); + KeMatrixColumnOp<<>>( + agg, A_d, C_d, dimM, dimN); } else { - int blocksX = (dimN + 32 -1) / 32; + int blocksX = (dimN + 32 - 1) / 32; int blocksY = 1; dim3 threads(32, 32); dim3 grid(blocksX, blocksY); - KeMatrixColumnOp_S<<< grid, threads, 0, STREAM_DEFAULT>>> - (agg, A_d, C_d, dimM, dimN); + KeMatrixColumnOp_S<<>>( + agg, A_d, C_d, dimM, dimN); } return; @@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_column_op(aggregate::sum(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_column_sum failed"); } @@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_column_op(aggregate::max(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_column_max failed"); } @@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_column_op(aggregate::min(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_column_min failed"); } @@ -226,16 +184,16 @@ template __global__ void KeVectorSum(real *E, real *Sum, int dimM) { __shared__ double sum_s[blockSize]; int tid = threadIdx.x; - int index = blockIdx.y*blockDim.x+threadIdx.x; + int index = blockIdx.y * blockDim.x + threadIdx.x; sum_s[tid] = 0.0f; while (index < dimM) { sum_s[tid] += E[index]; - index += blockDim.x*gridDim.y; + index += blockDim.x * 
gridDim.y; } __syncthreads(); - for (int stride = blockSize/2; stride > 0; stride = stride/2) { + for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { if (tid < stride) { sum_s[tid] += sum_s[tid + stride]; } @@ -259,38 +217,39 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) { dim3 threads(blockSize, 1); dim3 grid(blocksX, blocksY); - struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; + struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; hl_event_t hl_event = &hl_event_st; - while (!hl_cuda_event_is_ready(hl_event)) {} + while (!hl_cuda_event_is_ready(hl_event)) { + } - KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> - (A_d, t_resource.gpu_mem, dimM); - KeVectorSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> - (t_resource.gpu_mem, t_resource.cpu_mem, 128); + KeVectorSum<128><<>>( + A_d, t_resource.gpu_mem, dimM); + KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>( + t_resource.gpu_mem, t_resource.cpu_mem, 128); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_synchronize(HPPL_STREAM_DEFAULT); cudaError_t err = (cudaError_t)hl_get_device_last_error(); - CHECK_EQ(cudaSuccess, err) - << "CUDA error: " << hl_get_device_error_string((size_t)err); + CHECK_EQ(cudaSuccess, err) << "CUDA error: " + << hl_get_device_error_string((size_t)err); } template __global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) { __shared__ double sum_s[blockSize]; int tid = threadIdx.x; - int index = blockIdx.y*blockDim.x+threadIdx.x; + int index = blockIdx.y * blockDim.x + threadIdx.x; sum_s[tid] = 0.0f; while (index < dimM) { sum_s[tid] += abs(E[index]); - index += blockDim.x*gridDim.y; + index += blockDim.x * gridDim.y; } __syncthreads(); - for (int stride = blockSize/2; stride > 0; stride = stride/2) { + for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { if (tid < stride) { sum_s[tid] += sum_s[tid + stride]; } @@ 
-314,20 +273,21 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) { dim3 threads(blockSize, 1); dim3 grid(blocksX, blocksY); - struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; + struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; hl_event_t hl_event = &hl_event_st; - while (!hl_cuda_event_is_ready(hl_event)) {} + while (!hl_cuda_event_is_ready(hl_event)) { + } - KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> - (A_d, t_resource.gpu_mem, dimM); - KeVectorAbsSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> - (t_resource.gpu_mem, t_resource.cpu_mem, 128); + KeVectorAbsSum<128><<>>( + A_d, t_resource.gpu_mem, dimM); + KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>( + t_resource.gpu_mem, t_resource.cpu_mem, 128); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_synchronize(HPPL_STREAM_DEFAULT); cudaError_t err = (cudaError_t)hl_get_device_last_error(); - CHECK_EQ(cudaSuccess, err) - << "CUDA error: " << hl_get_device_error_string((size_t)err); + CHECK_EQ(cudaSuccess, err) << "CUDA error: " + << hl_get_device_error_string((size_t)err); } diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index b6e3e63a4f52261e49467bd82fdabd063e81460e..aac19b1ea566ad69f1f7374e393676c8debd9883 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -12,21 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include #include "hl_base.h" #include "hl_cnn.h" #include "hl_device_functions.cuh" -__global__ void KeMaxPoolForward(const int nthreads, const real* inputData, - const int channels, const int height, +__global__ void KeMaxPoolForward(const int nthreads, + const real* inputData, + const int channels, + const int height, const int width, - const int pooledH, const int pooledW, - const int ksizeW, const int ksizeH, - const int strideH, const int strideW, - const int offsetH, const int offsetW, - real* tgtData, const int tgtStride) { - int index = blockIdx.x * blockDim.x + threadIdx.x; + const int pooledH, + const int pooledW, + const int ksizeW, + const int ksizeH, + const int strideH, + const int strideW, + const int offsetH, + const int offsetW, + real* tgtData, + const int tgtStride) { + int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; int ph = (index / pooledW) % pooledH; @@ -46,44 +52,70 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData, maxval = inputData[h * width + w]; } } - int tgtIndex = index % (pooledW * pooledH * channels) + - frameNum * tgtStride; + int tgtIndex = + index % (pooledW * pooledH * channels) + frameNum * tgtStride; tgtData[tgtIndex] = maxval; } } -void hl_maxpool_forward(const int frameCnt, const real* inputData, +void hl_maxpool_forward(const int frameCnt, + const real* inputData, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real* tgtData, const int tgtStride) { - + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) { int num_kernels = pooledH * pooledW * channels * frameCnt; int blocks = 
(num_kernels + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); - KeMaxPoolForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (num_kernels, inputData, channels, height, width, - pooledH, pooledW, sizeX, sizeY, strideH, strideW, - paddingH, paddingW, tgtData, tgtStride); + KeMaxPoolForward<<>>(num_kernels, + inputData, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + tgtData, + tgtStride); CHECK_SYNC("hl_maxpool_forward failed"); } -__global__ void KeMaxPoolBackward(const int nthreads, const real* inputData, - const real* outData, const real* outGrad, - const int channels, const int height, +__global__ void KeMaxPoolBackward(const int nthreads, + const real* inputData, + const real* outData, + const real* outGrad, + const int channels, + const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int padH, const int padW, - real scaleA, real scaleB, - real* targetGrad, const int outStride) { - int index = blockIdx.x * blockDim.x + threadIdx.x; + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int padH, + const int padW, + real scaleA, + real scaleB, + real* targetGrad, + const int outStride) { + int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { // find out the local index // find out the local offset @@ -107,43 +139,69 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData, } } } - targetGrad[index] = - scaleB * targetGrad[index] + scaleA * gradient; + targetGrad[index] = scaleB * targetGrad[index] + scaleA * gradient; } } -void hl_maxpool_backward(const int frameCnt, const real* inputData, - const real* outData, const real* outGrad, - const int channels, const int height, - const int width, - const int pooledH, const int pooledW, - const int sizeX, const int 
sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real scaleA, real scaleB, - real* targetGrad, const int outStride) { - +void hl_maxpool_backward(const int frameCnt, + const real* inputData, + const real* outData, + const real* outGrad, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real scaleA, + real scaleB, + real* targetGrad, + const int outStride) { int num_kernels = height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; - KeMaxPoolBackward<<< blocks, 1024, 0, STREAM_DEFAULT >>> - (num_kernels, inputData, outData, outGrad, channels, - height, width, pooledH, pooledW, sizeX, sizeY, - strideH, strideW, - paddingH, paddingW, - scaleA, scaleB, - targetGrad, outStride); + KeMaxPoolBackward<<>>(num_kernels, + inputData, + outData, + outGrad, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleA, + scaleB, + targetGrad, + outStride); CHECK_SYNC("hl_maxpool_backward"); } -__global__ void KeAvgPoolForward(const int nthreads, const real* inputData, +__global__ void KeAvgPoolForward(const int nthreads, + const real* inputData, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int padH, const int padW, - real* tgtData, const int tgtStride) { + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int padH, + const int padW, + real* tgtData, + const int tgtStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; @@ -168,39 +226,64 @@ __global__ void 
KeAvgPoolForward(const int nthreads, const real* inputData, aveval += inputData[h * width + w]; } } - int tgtIndex = index % (pooledW * pooledH * channels) + - frameNum * tgtStride; + int tgtIndex = + index % (pooledW * pooledH * channels) + frameNum * tgtStride; tgtData[tgtIndex] = aveval / pool_size; } } -void hl_avgpool_forward(const int frameCnt, const real* inputData, +void hl_avgpool_forward(const int frameCnt, + const real* inputData, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real* tgtData, const int tgtStride) { + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) { int num_kernels = pooledH * pooledW * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; - KeAvgPoolForward<<< blocks, 1024, 0, STREAM_DEFAULT >>> - (num_kernels, inputData, channels, - height, width, pooledH, pooledW, - sizeX, sizeY, strideH, strideW, - paddingH, paddingW, tgtData, tgtStride); + KeAvgPoolForward<<>>(num_kernels, + inputData, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + tgtData, + tgtStride); CHECK_SYNC("hl_avgpool_forward failed"); } -__global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, - const int channels, const int height, +__global__ void KeAvgPoolBackward(const int nthreads, + const real* outGrad, + const int channels, + const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int padH, const int padW, - real scaleA, real scaleB, - real* tgtGrad, const int outStride) { + const int pooledH, + const int 
pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int padH, + const int padW, + real scaleA, + real scaleB, + real* tgtGrad, + const int outStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int offsetW = index % width + padW; @@ -215,7 +298,6 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, real gradient = 0; outGrad += (frameNum * outStride + offsetC * pooledH * pooledW); - for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { // figure out the pooling size @@ -224,32 +306,50 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, int hend = min(hstart + sizeY, height + padH); int wend = min(wstart + sizeX, width + padW); int poolsize = (hend - hstart) * (wend - wstart); - gradient += outGrad[ph * pooledW + pw]/poolsize; + gradient += outGrad[ph * pooledW + pw] / poolsize; } } tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient; } } -void hl_avgpool_backward(const int frameCnt, const real* outGrad, +void hl_avgpool_backward(const int frameCnt, + const real* outGrad, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real scaleA, real scaleB, - real* backGrad, const int outStride) { + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real scaleA, + real scaleB, + real* backGrad, + const int outStride) { int num_kernels = height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; - KeAvgPoolBackward <<< blocks, 1024, 0, STREAM_DEFAULT >>> - (num_kernels, outGrad, channels, height, width, - pooledH, pooledW, sizeX, sizeY, - strideH, strideW, - paddingH, 
paddingW, - scaleA, scaleB, - backGrad, outStride); + KeAvgPoolBackward<<>>(num_kernels, + outGrad, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleA, + scaleB, + backGrad, + outStride); CHECK_SYNC("hl_avgpool_backward failed"); } @@ -266,7 +366,7 @@ __global__ void KeBilinearInterpFw(const real* in, const size_t numChannels, const real ratioH, const real ratioW) { - int nthreads = outputH * outputW; + int nthreads = outputH * outputW; int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < nthreads) { int outIdH = tid / outputW; @@ -287,13 +387,14 @@ __global__ void KeBilinearInterpFw(const real* in, real w1lambda = ratioW * outImgIdx - inImgIdx; real w2lambda = 1.f - w1lambda; - const real* inPos = - &in[outIdH * inputW + channelId * inImgSize + inImgIdy * inImgW + inImgIdx]; + const real* inPos = &in[outIdH * inputW + channelId * inImgSize + + inImgIdy * inImgW + inImgIdx]; // bilinear interpolation out[outIdH * outputW + outIdW] = - h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) + - h1lambda * (w2lambda * inPos[hId * inImgW] + w1lambda * inPos[hId * inImgW + wId]); + h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) + + h1lambda * (w2lambda * inPos[hId * inImgW] + + w1lambda * inPos[hId * inImgW + wId]); } } @@ -313,9 +414,19 @@ void hl_bilinear_forward(const real* inData, int threadNum = outputH * outputW; int blocks = (threadNum + 1024 - 1) / 1024; - KeBilinearInterpFw<<< blocks, 1024, 0, STREAM_DEFAULT>>>( - inData, inImgH, inImgW, inputH, inputW, outData, outImgH, - outImgW, outputH, outputW, numChannels, ratioH, ratioW); + KeBilinearInterpFw<<>>(inData, + inImgH, + inImgW, + inputH, + inputW, + outData, + outImgH, + outImgW, + outputH, + outputW, + numChannels, + ratioH, + ratioW); CHECK_SYNC("hl_bilinear_forward failed"); } @@ -353,13 +464,15 @@ __global__ void KeBilinearInterpBw(real* in, real w1lambda = ratioW * outImgIdx - inImgIdx; real w2lambda = 1.f - 
w1lambda; - real* inPos = - &in[outIdH * inputW + channelId * inImgSize + inImgIdy * inImgW + inImgIdx]; + real* inPos = &in[outIdH * inputW + channelId * inImgSize + + inImgIdy * inImgW + inImgIdx]; const real* outPos = &out[outIdH * outputW + outIdW]; paddle::paddleAtomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]); paddle::paddleAtomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]); - paddle::paddleAtomicAdd(&inPos[hId * inImgW], h1lambda * w2lambda * outPos[0]); - paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId], h1lambda * w1lambda * outPos[0]); + paddle::paddleAtomicAdd(&inPos[hId * inImgW], + h1lambda * w2lambda * outPos[0]); + paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId], + h1lambda * w1lambda * outPos[0]); } } @@ -379,22 +492,37 @@ void hl_bilinear_backward(real* inGrad, int threadNum = outputH * outputW; int blocks = (threadNum + 1024 - 1) / 1024; - KeBilinearInterpBw<<< blocks, 1024, 0, STREAM_DEFAULT>>>( - inGrad, inImgH, inImgW, inputH, inputW, outGrad, outImgH, - outImgW, outputH, outputW, numChannels, ratioH, ratioW); + KeBilinearInterpBw<<>>(inGrad, + inImgH, + inImgW, + inputH, + inputW, + outGrad, + outImgH, + outImgW, + outputH, + outputW, + numChannels, + ratioH, + ratioW); CHECK_SYNC("hl_bilinear_backward failed"); } -__global__ void maxoutFpCompute(size_t nthreads, const real * inData, - real * outData, int* idData, - size_t size, size_t featLen, size_t groups) { +__global__ void maxoutFpCompute(size_t nthreads, + const real* inData, + real* outData, + int* idData, + size_t size, + size_t featLen, + size_t groups) { int index = blockIdx.x * blockDim.x + threadIdx.x; - if(index < nthreads) { + if (index < nthreads) { size_t batch_idx = index / size; size_t i = index % size; size_t channel_idx = i / featLen; size_t feat_idx = i % featLen; - size_t data_idx = (batch_idx * size + channel_idx * featLen) * groups + feat_idx; + size_t data_idx = + (batch_idx * size + channel_idx * featLen) * groups + feat_idx; real max = 
inData[data_idx]; int maxId = 0; for (size_t g = 1; g < groups; ++g) { @@ -409,37 +537,50 @@ __global__ void maxoutFpCompute(size_t nthreads, const real * inData, } } -void hl_maxout_forward(const real* inData, real* outData, - int* idData, size_t batchSize, size_t size, - size_t featLen, size_t groups) { +void hl_maxout_forward(const real* inData, + real* outData, + int* idData, + size_t batchSize, + size_t size, + size_t featLen, + size_t groups) { int num_kernels = size * batchSize; int blocks = (num_kernels + 1024 - 1) / 1024; - maxoutFpCompute<<< blocks, 1024, 0, STREAM_DEFAULT>>>( - num_kernels, inData, outData, idData, size, featLen, groups); + maxoutFpCompute<<>>( + num_kernels, inData, outData, idData, size, featLen, groups); CHECK_SYNC("hl_maxout_forward failed"); } -__global__ void maxoutBpCompute(size_t nthreads, real* inGrad, - const real* outGrad, const int* idData, - size_t size, size_t featLen, size_t groups) { +__global__ void maxoutBpCompute(size_t nthreads, + real* inGrad, + const real* outGrad, + const int* idData, + size_t size, + size_t featLen, + size_t groups) { int index = blockIdx.x * blockDim.x + threadIdx.x; - if(index < nthreads) { + if (index < nthreads) { size_t batch_idx = index / size; size_t i = index % size; size_t channel_idx = i / featLen; size_t feat_idx = i % featLen; size_t newIndex = batch_idx * size; - size_t gradIdx = (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx; + size_t gradIdx = + (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx; (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i]; } } -void hl_maxout_backward(real* inGrad, const real* outGrad, - const int* idData, size_t batchSize, size_t size, - size_t featLen, size_t groups) { +void hl_maxout_backward(real* inGrad, + const real* outGrad, + const int* idData, + size_t batchSize, + size_t size, + size_t featLen, + size_t groups) { int num_kernels = size * batchSize; int blocks = (num_kernels + 1024 - 1) / 1024; - 
maxoutBpCompute<<< blocks, 1024, 0, STREAM_DEFAULT >>>( - num_kernels, inGrad, outGrad, idData, size, featLen, groups); + maxoutBpCompute<<>>( + num_kernels, inGrad, outGrad, idData, size, featLen, groups); CHECK_SYNC("hl_maxout_backward failed"); } diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index c53a5636829cab9d575f58cc2326cb3efe383e1c..78642a17443b0b4d81defaa46579332ef20c71a1 100644 --- a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -1022,6 +1022,7 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, real alpha = 1.0f; real beta = 1.0f; cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; + CHECK_CUDNN( dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle, mode, diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu index b869d903ba3cfb188f823518ba8ee7d17f9b2440..a5ce81a904ebbd655a16ef68660b81d442478575 100644 --- a/paddle/cuda/src/hl_cuda_lstm.cu +++ b/paddle/cuda/src/hl_cuda_lstm.cu @@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - +#include "hl_activation_functions.h" #include "hl_base.h" #include "hl_cuda_cublas.h" #include "hl_device_functions.cuh" -#include "hl_activation_functions.h" #include "paddle/utils/Logging.h" -typedef hppl::Active::forward t_forward; +typedef hppl::Active::forward t_forward; typedef hppl::Active::backward t_backward; bool hl_lstm_sequence_parallel(int frameSize) { @@ -42,9 +41,9 @@ public: value_ += (start + length - 1) * frameSize + idx; } } - __device__ inline real *getPtr() const {return value_;} - __device__ inline real getValue() {return *value_;} - __device__ inline void setValue(real value) {*value_ = value;} + __device__ inline real *getPtr() const { return value_; } + __device__ inline real getValue() { return *value_; } + __device__ inline void setValue(real value) { *value_ = value; } template __device__ inline void nextFrame() { if (reversed == 0) { @@ -55,28 +54,25 @@ public: } }; -__device__ __forceinline__ -void ptx_sync(const int id, const int barriers) { +__device__ __forceinline__ void ptx_sync(const int id, const int barriers) { asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(barriers) : "memory"); } -__device__ __forceinline__ -void ptx_arrive(const int id, const int barriers) { +__device__ __forceinline__ void ptx_arrive(const int id, const int barriers) { asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(barriers) : "memory"); } -template -__device__ __forceinline__ real -forward_sequence(real value, - real *shValue, - real *state, - real *preOutput, - real *output, - real check, - int index, - t_forward activeNode, - t_forward activeGate, - t_forward activeState) { +template +__device__ __forceinline__ real forward_sequence(real value, + real *shValue, + real *state, + real *preOutput, + real *output, + real check, + int index, + t_forward activeNode, + t_forward activeGate, + t_forward activeState) { real out; real prevOut; real state_r; @@ -112,17 +108,20 @@ forward_sequence(real value, if (idy == 0) { ptx_sync(2, frameSize * 2); 
prevOut = state[idx]; - prevOut = activeState(prevOut); + prevOut = activeState(prevOut); preOutput[idx] = prevOut; ptx_arrive(3, frameSize * 2); } return value; } -#define OUTPUT_BARRIER_ID 10 -#define OUTPUT_BARRIER_ID2 11 -template +#define OUTPUT_BARRIER_ID 10 +#define OUTPUT_BARRIER_ID2 11 +template __global__ void KeLstmForward(real *gateValue, real *state, real *output, @@ -184,10 +183,16 @@ __global__ void KeLstmForward(real *gateValue, } } value = forward_sequence( - value, shValue, shState, shPrevOutput, shOutput, check, index, - hppl::gpu::forward[active_node], - hppl::gpu::forward[active_gate], - hppl::gpu::forward[active_state]); + value, + shValue, + shState, + shPrevOutput, + shOutput, + check, + index, + hppl::gpu::forward[active_node], + hppl::gpu::forward[active_gate], + hppl::gpu::forward[active_state]); const int idx = index % frameSize; const int idy = index / frameSize; if (valueSize == 128) { @@ -218,7 +223,7 @@ __global__ void KeLstmForward(real *gateValue, real B_r[frameSize]; const int computeIdx = index - valueSize; if (i == 0) { - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { B_r[n] = weight[n * valueSize + computeIdx]; } @@ -230,7 +235,7 @@ __global__ void KeLstmForward(real *gateValue, } real sum = 0.0f; for (int n = 0; n < frameSize; n++) { - sum += A_r[n]*B_r[n]; + sum += A_r[n] * B_r[n]; } shValue[computeIdx] = sum; ptx_arrive(OUTPUT_BARRIER_ID2, blockSize); @@ -239,14 +244,14 @@ __global__ void KeLstmForward(real *gateValue, if (valueSize == 256) { real B_r[frameSize]; if (i == 0) { - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { B_r[n] = weight[n * valueSize + index]; } } real sum = 0.0f; for (int n = 0; n < frameSize; n++) { - sum += shOutput[n]*B_r[n]; + sum += shOutput[n] * B_r[n]; } value += sum; } @@ -273,50 +278,81 @@ void hl_lstm_parallel_forward(real *gateValue, dim3 grid(numSequences, 1); if (!reversed) { if (frameSize == 32) { - KeLstmForward<128, 32, 0, 128, 256> - <<>> - 
(gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<128, 32, 0, 128, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmForward<256, 64, 0, 256, 256> - <<>> - (gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<256, 64, 0, 256, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } } else { if (frameSize == 32) { - KeLstmForward<128, 32, 1, 128, 256> - <<>> - (gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<128, 32, 1, 128, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmForward<256, 64, 1, 256, 256> - <<>> - (gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<256, 64, 1, 256, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } } CHECK_SYNC("hl_lstm_parallel_forward failed"); } -__device__ __forceinline__ -void transpose_32x32(real a[], const int idx) { +__device__ __forceinline__ void transpose_32x32(real a[], const int idx) { int addr = idx % 32; - #pragma unroll +#pragma unroll for (int k = 1; k < 32; k++) { // rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32); addr = __shfl(addr, 
(idx + 1) % 32, 32); a[k] = __shfl(a[k], addr, 32); } - #pragma unroll +#pragma unroll for (int tid = 0; tid < 31; tid++) { real tmp = (idx > tid) ? a[0] : a[1]; - #pragma unroll +#pragma unroll for (int k = 31; k > 0; k--) { a[(k + 1) % 32] = (idx > tid) ? a[k] : a[(k + 1) % 32]; } @@ -324,29 +360,28 @@ void transpose_32x32(real a[], const int idx) { } addr = (32 - idx) % 32; - #pragma unroll +#pragma unroll for (int k = 0; k < 32; k++) { a[k] = __shfl(a[k], addr, 32); addr = __shfl(addr, (idx + 31) % 32, 32); } } -template -__device__ void -backward_sequence(real rGateValue, - real rOutputGrad, - real rPreOutputValue, - real &rGateGrad, - real &rStateGrad, - real *shStateGrad, - real *shStateValue, - real *shGateValue, - real rCheck, - real &rGateValuePrev, - int index, - t_backward activeNode, - t_backward activeGate, - t_backward activeState) { +template +__device__ void backward_sequence(real rGateValue, + real rOutputGrad, + real rPreOutputValue, + real &rGateGrad, + real &rStateGrad, + real *shStateGrad, + real *shStateValue, + real *shGateValue, + real rCheck, + real &rGateValuePrev, + int index, + t_backward activeNode, + t_backward activeGate, + t_backward activeState) { const int frameIdx = index % frameSize; const int frameIdy = index / frameSize; if (frameIdy == 3) { @@ -363,8 +398,8 @@ backward_sequence(real rGateValue, rStateGrad = rGateGrad * rCheck; shStateGrad[index] = rStateGrad; ptx_sync(3, valueSize); - rStateGrad += shStateGrad[frameIdx + frameSize *2]; - rStateGrad += shStateGrad[frameIdx + frameSize *3]; + rStateGrad += shStateGrad[frameIdx + frameSize * 2]; + rStateGrad += shStateGrad[frameIdx + frameSize * 3]; rGateGrad = rStateGrad * shGateValue[frameIdx]; rGateGrad = activeGate(rGateGrad, rGateValue); } else if (frameIdy == 2) { @@ -373,7 +408,7 @@ backward_sequence(real rGateValue, shStateGrad[index] = rStateGrad; ptx_sync(3, valueSize); rStateGrad += shStateGrad[frameIdx + frameSize]; - rStateGrad += shStateGrad[frameIdx + frameSize 
*3]; + rStateGrad += shStateGrad[frameIdx + frameSize * 3]; rGateValuePrev = rGateValue; rGateGrad = rStateGrad * shStateValue[frameIdx]; rGateGrad = activeGate(rGateGrad, rGateValue); @@ -381,43 +416,43 @@ backward_sequence(real rGateValue, shGateValue[frameIdx] = rGateValue; ptx_sync(3, valueSize); rStateGrad = shStateGrad[frameIdx + frameSize]; - rStateGrad += shStateGrad[frameIdx + frameSize *2]; - rStateGrad += shStateGrad[frameIdx + frameSize *3]; + rStateGrad += shStateGrad[frameIdx + frameSize * 2]; + rStateGrad += shStateGrad[frameIdx + frameSize * 3]; rGateGrad = rStateGrad * shGateValue[frameIdx + frameSize]; rGateGrad = activeNode(rGateGrad, rGateValue); } } -template +template __device__ void load_weight(real rWeight[], real *weight, const int index) { if (valueSize == 128) { weight += index; - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { - rWeight[n] = weight[n*valueSize]; + rWeight[n] = weight[n * valueSize]; } transpose_32x32(rWeight, index % 32); } if (valueSize == 256) { int id = (index / 32) % 2; weight += index - id * 32 + id * 32 * valueSize; - #pragma unroll +#pragma unroll for (int n = 0; n < 32; n++) { - rWeight[n] = weight[n*valueSize]; - rWeight[n + 32] = weight[n*valueSize + 32]; + rWeight[n] = weight[n * valueSize]; + rWeight[n + 32] = weight[n * valueSize + 32]; } transpose_32x32(rWeight, index % 32); transpose_32x32(&rWeight[32], index % 32); } } -template +template __global__ void KeLstmBackward(real *gateValue, real *gateGrad, real *stateValue, - real *stateGrad, /* do not need save */ + real *stateGrad, /* do not need save */ real *preOutputValue, - real *preOutputGrad, /* do not need save */ + real *preOutputGrad, /* do not need save */ real *checkIg, real *checkIgGrad, real *checkFg, @@ -484,20 +519,27 @@ __global__ void KeLstmBackward(real *gateValue, for (int i = 0; i < length; ++i) { if (frameIdy == 3) { - if (i != length -1) { + if (i != length - 1) { frameStateValue.nextFrame(); shStateValue[frameIdx] 
= frameStateValue.getValue(); } else { shStateValue[frameIdx] = 0.0; } } - backward_sequence( - rGateValue, rOutputGrad, rPreOutputValue, rGateGrad, - rStateGrad, shStateGrad, shStateValue, shGateValue, - rCheck, rGateValuePrev, index, - hppl::gpu::backward[active_node], - hppl::gpu::backward[active_gate], - hppl::gpu::backward[active_state]); + backward_sequence(rGateValue, + rOutputGrad, + rPreOutputValue, + rGateGrad, + rStateGrad, + shStateGrad, + shStateValue, + shGateValue, + rCheck, + rGateValuePrev, + index, + hppl::gpu::backward[active_node], + hppl::gpu::backward[active_gate], + hppl::gpu::backward[active_state]); if (frameIdy == 3) { rCheckGrad += rGateGrad * rStateValue; rStateValue = shStateValue[frameIdx]; @@ -523,9 +565,9 @@ __global__ void KeLstmBackward(real *gateValue, shGateGrad[frameIdy][frameIdx] = rGateGrad; if (valueSize == 128) { real sum = 0.0f; - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { - sum += shGateGrad[frameIdy][n]*B_r[n]; + sum += shGateGrad[frameIdy][n] * B_r[n]; } if (frameIdy == 3) { rOutputGrad += sum; @@ -541,7 +583,7 @@ __global__ void KeLstmBackward(real *gateValue, } real sum = 0.0f; for (int n = 0; n < frameSize; n++) { - sum += A_r[n]*B_r[n]; + sum += A_r[n] * B_r[n]; } if (frameIdy == 3) { rOutputGrad += sum; @@ -552,8 +594,8 @@ __global__ void KeLstmBackward(real *gateValue, if (frameIdy == 3) { ptx_sync(6, valueSize); - #pragma unroll - for (int i = 0; i < 3; i ++) { +#pragma unroll + for (int i = 0; i < 3; i++) { rOutputGrad += shOutputGrad[i][frameIdx]; } } else { @@ -564,11 +606,14 @@ __global__ void KeLstmBackward(real *gateValue, /* TODO: Temporary save & merger in another kernel */ if (frameIdy == 1) { - if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad); + if (checkIgGrad) + paddle::paddleAtomicAdd(checkIgGrad + frameIdx, rCheckGrad); } else if (frameIdy == 2) { - if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad); + if (checkFgGrad) + 
paddle::paddleAtomicAdd(checkFgGrad + frameIdx, rCheckGrad); } else if (frameIdy == 3) { - if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad); + if (checkOgGrad) + paddle::paddleAtomicAdd(checkOgGrad + frameIdx, rCheckGrad); } } @@ -593,68 +638,183 @@ void hl_lstm_parallel_backward_data(real *gateValue, hl_activation_mode_t active_node, hl_activation_mode_t active_gate, hl_activation_mode_t active_state) { - CHECK(frameSize == 32 || frameSize == 64 || - frameSize == 128 || frameSize == 256); + CHECK(frameSize == 32 || frameSize == 64 || frameSize == 128 || + frameSize == 256); dim3 grid(numSequences, 1); if (!reversed) { if (frameSize == 32) { - KeLstmBackward<128, 32, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<128, 32, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmBackward<256, 64, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<256, 64, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 128) { - KeLstmBackward<512, 128, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, 
outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<512, 128, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 256) { - KeLstmBackward<1024, 256, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<1024, 256, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } } else { if (frameSize == 32) { - KeLstmBackward<128, 32, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<128, 32, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmBackward<256, 64, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<256, 64, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + 
weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 128) { - KeLstmBackward<512, 128, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<512, 128, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 256) { - KeLstmBackward<1024, 256, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<1024, 256, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } } CHECK_SYNC("hl_lstm_parallel_backward_data"); } -template +template __global__ void KeSetGradZero(real *gateGrad, - const int *starts, int valueSize, int numSequences, bool reversed) { + const int *starts, + int valueSize, + int numSequences, + bool reversed) { // const int tid = threadIdx.x; const int frameIdx = blockIdx.x * B_X + threadIdx.x; @@ -682,19 +842,31 @@ void hl_lstm_parallel_backward_weight(real *weightGrad, int valueSize = 4 * frameSize; dim3 threads(32, 32); dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32); - KeSetGradZero<32, 32><<>> - (gateGrad, sequence, valueSize, numSequences, reversed); + KeSetGradZero<32, 32><<>>( + gateGrad, sequence, valueSize, numSequences, reversed); if (!reversed) { hl_matrix_mul(outputValue, - HPPL_OP_T, gateGrad + 
valueSize, HPPL_OP_N, weightGrad, - frameSize, valueSize, batchSize - 1, - 1.0, 1.0); + HPPL_OP_T, + gateGrad + valueSize, + HPPL_OP_N, + weightGrad, + frameSize, + valueSize, + batchSize - 1, + 1.0, + 1.0); } else { hl_matrix_mul(outputValue + frameSize, - HPPL_OP_T, gateGrad, HPPL_OP_N, weightGrad, - frameSize, valueSize, batchSize - 1, - 1.0, 1.0); + HPPL_OP_T, + gateGrad, + HPPL_OP_N, + weightGrad, + frameSize, + valueSize, + batchSize - 1, + 1.0, + 1.0); } CHECK_SYNC("hl_lstm_parallel_backward_weight"); } diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index 9bcc7fb7de44b2211db450fb164655f7947dcad9..39272456c394adc0509e60cf5972df832f7b3424 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "hl_base.h" +#include "hl_device_functions.cuh" +#include "hl_gpu_matrix_kernel.cuh" #include "hl_matrix.h" -#include "hl_matrix_ops.cuh" #include "hl_matrix_apply.cuh" +#include "hl_matrix_ops.cuh" #include "hl_sequence.h" #include "hl_sparse.ph" #include "paddle/utils/Logging.h" -#include "hl_device_functions.cuh" -#include "hl_gpu_matrix_kernel.cuh" DEFINE_MATRIX_UNARY_OP(Zero, a = 0); -DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b); -void hl_matrix_add(real *A_d, - real *B_d, - real *C_d, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b); +void hl_matrix_add(real* A_d, + real* B_d, + real* C_d, int dimM, int dimN, real alpha, @@ -36,33 +35,32 @@ void hl_matrix_add(real *A_d, CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); - hl_gpu_apply_ternary_op - , 0, 0>(ternary::_add(alpha, beta), - A_d, - B_d, - C_d, - dimM, - dimN, - dimN, - dimN, - dimN); + hl_gpu_apply_ternary_op, 0, 0>( + ternary::_add(alpha, beta), + A_d, + B_d, + C_d, + dimM, + dimN, + dimN, + 
dimN, + dimN); CHECK_SYNC("hl_matrix_add failed"); } #ifdef PADDLE_TYPE_DOUBLE - #define THRESHOLD 128 +#define THRESHOLD 128 #else - #define THRESHOLD 64 +#define THRESHOLD 64 #endif -__device__ __forceinline__ -void findMax(real* I, - real* dfMax_s, - int blockSize, - int base, - int curIdx, - int nextIdx, - int dimN, - real* max) { +__device__ __forceinline__ void findMax(real* I, + real* dfMax_s, + int blockSize, + int base, + int curIdx, + int nextIdx, + int dimN, + real* max) { dfMax_s[base] = -1.0e20; while (curIdx < dimN) { if (dfMax_s[base] < I[nextIdx]) { @@ -78,25 +76,24 @@ void findMax(real* I, if (base < stride) { nextIdx = base + stride; if (dfMax_s[base] < dfMax_s[nextIdx]) { - dfMax_s[base] = dfMax_s[nextIdx]; + dfMax_s[base] = dfMax_s[nextIdx]; } } } - if (0 == base) { + if (0 == base) { max[0] = dfMax_s[0]; } __syncthreads(); } -__device__ __forceinline__ -void subMaxAndExp(real* I, - real* O, - int curIdx, - int nextIdx, - int blockSize, - int dimN, - real max) { +__device__ __forceinline__ void subMaxAndExp(real* I, + real* O, + int curIdx, + int nextIdx, + int blockSize, + int dimN, + real max) { real val; while (curIdx < dimN) { val = I[nextIdx] - max; @@ -115,14 +112,13 @@ void subMaxAndExp(real* I, __syncthreads(); } -__device__ __forceinline__ -void valueSum(real* O, - real* dfMax_s, - int blockSize, - int base, - int curIdx, - int nextIdx, - int dimN) { +__device__ __forceinline__ void valueSum(real* O, + real* dfMax_s, + int blockSize, + int base, + int curIdx, + int nextIdx, + int dimN) { dfMax_s[base] = 0; while (curIdx < dimN) { dfMax_s[base] += O[nextIdx]; @@ -141,13 +137,8 @@ void valueSum(real* O, __syncthreads(); } -__device__ __forceinline__ -void divSum(real* O, - real sum, - int curIdx, - int nextIdx, - int blockSize, - int dimN) { +__device__ __forceinline__ void divSum( + real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) { while (curIdx < dimN) { O[nextIdx] /= sum; nextIdx += blockSize; @@ -155,20 +146,18 @@ 
void divSum(real* O, } } -__device__ __forceinline__ -void softmax(real* I, - real* O, - real* dfMax_s, - int blockSize, - int base, - int curIdx, - int nextIdx, - int dimN) { +__device__ __forceinline__ void softmax(real* I, + real* O, + real* dfMax_s, + int blockSize, + int base, + int curIdx, + int nextIdx, + int dimN) { __shared__ real max; // find the max number - findMax(I, dfMax_s, blockSize, base, curIdx, - nextIdx, dimN, &max); + findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max); // sub max Value and do Exp operation subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max); @@ -181,8 +170,8 @@ void softmax(real* I, divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN); } -template -__global__ void KeMatrixSoftMax(real *O, real *I, int dimN) { +template +__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) { int base = threadIdx.x; __shared__ real dfMax_s[blockSize]; int nextIdx = blockIdx.x * dimN + base; @@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) { softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); } -void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) { +void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); dim3 block(512, 1); dim3 grid(dimM, 1); - KeMatrixSoftMax<512> - <<>>(C_d, A_d, dimN); + KeMatrixSoftMax<512><<>>(C_d, A_d, dimN); CHECK_SYNC("hl_matrix_softmax failed"); } -template -__global__ void KeSequenceSoftMax(real *O, real *I, const int* index) { +template +__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) { int base = threadIdx.x; int bid = blockIdx.x; __shared__ real dfMax_s[blockSize]; @@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) { softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); } -void hl_sequence_softmax_forward(real *A_d, - real *C_d, +void hl_sequence_softmax_forward(real* A_d, + real* C_d, const int* index, int 
numSequence) { CHECK_NOTNULL(A_d); @@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d, dim3 block(512, 1); dim3 grid(numSequence, 1); - KeSequenceSoftMax<512> - <<>>(C_d, A_d, index); + KeSequenceSoftMax<512><<>>(C_d, A_d, index); CHECK_SYNC("hl_sequence_softmax_forward failed"); } -__global__ void KeMatrixDerivative(real *grad_d, - real *output_d, - real *sftmaxSum_d, - int dimM, - int dimN) { - int rowIdx = blockIdx.x*blockDim.x + threadIdx.x; - int colIdx = blockIdx.y*blockDim.y + threadIdx.y; +__global__ void KeMatrixDerivative( + real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) { + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + int colIdx = blockIdx.y * blockDim.y + threadIdx.y; int index; if (rowIdx < dimM && colIdx < dimN) { - index = rowIdx*dimN + colIdx; + index = rowIdx * dimN + colIdx; grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]); } } -void hl_matrix_softmax_derivative(real *grad_d, - real *output_d, - real *sftmaxSum_d, - int dimM, - int dimN) { +void hl_matrix_softmax_derivative( + real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) { CHECK_NOTNULL(grad_d); CHECK_NOTNULL(output_d); CHECK_NOTNULL(sftmaxSum_d); int blocksX = (dimM + 0) / 1; - int blocksY = (dimN + 1024 -1) / 1024; + int blocksY = (dimN + 1024 - 1) / 1024; dim3 threads(1, 1024); dim3 grid(blocksX, blocksY); - KeMatrixDerivative<<< grid, threads, 0, STREAM_DEFAULT >>> - (grad_d, output_d, sftmaxSum_d, dimM, dimN); + KeMatrixDerivative<<>>( + grad_d, output_d, sftmaxSum_d, dimM, dimN); CHECK_SYNC("hl_matrix_softmax_derivative failed"); } -__global__ void KeMatrixMultiBinaryCrossEntropy(real* output, - real* entropy, - int* row, - int* col, - int dimM, - int dimN) { +__global__ void KeMatrixMultiBinaryCrossEntropy( + real* output, real* entropy, int* row, int* col, int dimM, int dimN) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < dimM) { - for (int i = 0; i < dimN; i ++) { + for (int i = 
0; i < dimN; i++) { entropy[index] -= log(1 - output[index * dimN + i]); } - int *row_col = col + row[index]; + int* row_col = col + row[index]; int col_num = row[index + 1] - row[index]; - for (int i = 0; i < col_num; i ++) { + for (int i = 0; i < col_num; i++) { real o = output[index * dimN + row_col[i]]; entropy[index] -= log(o / (1 - o)); } @@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output, dim3 threads(n_threads); dim3 grid(blocks); hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); - KeMatrixMultiBinaryCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>> - (output, entropy, mat->csr_row, mat->csr_col, dimM, dimN); + KeMatrixMultiBinaryCrossEntropy<<>>( + output, entropy, mat->csr_row, mat->csr_col, dimM, dimN); CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed"); } -__global__ void KeMatrixMultiBinaryCrossEntropyBp(real* output, - real* grad, - int* row, - int* col, - int dimM, - int dimN) { +__global__ void KeMatrixMultiBinaryCrossEntropyBp( + real* output, real* grad, int* row, int* col, int dimM, int dimN) { int row_idx = blockIdx.x * blockDim.x + threadIdx.x; if (row_idx < dimM) { - for (int i = 0; i < dimN; i ++) { + for (int i = 0; i < dimN; i++) { int index = row_idx * dimN + i; grad[index] += 1.0 / (1 - output[index]); } int col_num = row[row_idx + 1] - row[row_idx]; - int *row_col = col + row[row_idx]; - for (int i = 0; i < col_num; i ++) { + int* row_col = col + row[row_idx]; + for (int i = 0; i < col_num; i++) { int index = row_idx * dimN + row_col[i]; grad[index] -= 1.0 / (output[index] * (1 - output[index])); } } } -void hl_matrix_multi_binary_cross_entropy_bp(real* output, - real* grad, - hl_sparse_matrix_s csr_mat, - int dimM, - int dimN) { +void hl_matrix_multi_binary_cross_entropy_bp( + real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) { CHECK_NOTNULL(output); CHECK_NOTNULL(grad); CHECK_NOTNULL(csr_mat); @@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output, 
dim3 threads(n_threads); dim3 grid(blocks); hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); - KeMatrixMultiBinaryCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>> - (output, grad, mat->csr_row, mat->csr_col, dimM, dimN); + KeMatrixMultiBinaryCrossEntropyBp<<>>( + output, grad, mat->csr_row, mat->csr_col, dimM, dimN); CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed"); } -__global__ void KeMatrixCrossEntropy(real* O, - real* E, - int* label, - int dimM, - int dimN) { +__global__ void KeMatrixCrossEntropy( + real* O, real* E, int* label, int dimM, int dimN) { int index = blockIdx.x * blockDim.x + threadIdx.x; int newBase; if (index < dimM) { @@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O, } } -void hl_matrix_cross_entropy(real* A_d, - real* C_d, - int* label_d, - int dimM, - int dimN) { +void hl_matrix_cross_entropy( + real* A_d, real* C_d, int* label_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); int blocks = (dimM + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); - KeMatrixCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>> - (A_d, C_d, label_d, dimM, dimN); + KeMatrixCrossEntropy<<>>( + A_d, C_d, label_d, dimM, dimN); CHECK_SYNC("hl_matrix_cross_entropy failed"); } -__global__ void KeMatrixCrossEntropyBp(real* grad_d, - real* output_d, - int* label_d, - int dimM, - int dimN) { - int rowIdx = blockIdx.x*blockDim.x + threadIdx.x; - int colIdx = blockIdx.y*blockDim.y + threadIdx.y; +__global__ void KeMatrixCrossEntropyBp( + real* grad_d, real* output_d, int* label_d, int dimM, int dimN) { + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + int colIdx = blockIdx.y * blockDim.y + threadIdx.y; int index; if (rowIdx < dimM && colIdx < dimN) { - index = rowIdx*dimN + colIdx; + index = rowIdx * dimN + colIdx; if (label_d[rowIdx] == colIdx) { grad_d[index] -= 1.0f / output_d[index]; } } } -void hl_matrix_cross_entropy_bp(real* grad_d, - real* output_d, - int* label_d, - int dimM, - int dimN) { 
+void hl_matrix_cross_entropy_bp( + real* grad_d, real* output_d, int* label_d, int dimM, int dimN) { CHECK_NOTNULL(grad_d); CHECK_NOTNULL(output_d); CHECK_NOTNULL(label_d); - int blocksX = (dimM + 0)/1; - int blocksY = (dimN + 1024 -1) / 1024; + int blocksX = (dimM + 0) / 1; + int blocksY = (dimN + 1024 - 1) / 1024; dim3 threads(1, 1024); dim3 grid(blocksX, blocksY); - KeMatrixCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>> - (grad_d, output_d, label_d, dimM, dimN); + KeMatrixCrossEntropyBp<<>>( + grad_d, output_d, label_d, dimM, dimN); CHECK_SYNC("hl_matrix_cross_entropy_bp failed"); } void hl_matrix_zero_mem(real* data, int num) { - hl_gpu_apply_unary_op( - unary::Zero(), data, 1, num, num); + hl_gpu_apply_unary_op(unary::Zero(), data, 1, num, num); } __global__ void KeParamReluForward(real* output, @@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output, int ty = blockIdx.y * blockDim.y + threadIdx.y; if (tx < width && ty < height) { int index = ty * width + tx; - output[index] = input[index] > 0 ? input[index] : - input[index] * w[tx / partial_sum]; + output[index] = + input[index] > 0 ? 
input[index] : input[index] * w[tx / partial_sum]; } } @@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output, CHECK_NOTNULL(w); dim3 threads(16, 16); int blockX = (width + 16 - 1) / 16; - int blockY = (height + 16 -1) / 16; + int blockY = (height + 16 - 1) / 16; dim3 grid(blockX, blockY); - KeParamReluForward<<>> - (output, input, w, width, height, partial_sum); + KeParamReluForward<<>>( + output, input, w, width, height, partial_sum); CHECK_SYNC("hl_param_relu_forward failed"); } -template +template __global__ void KeParamReluBackWardW(real* grad_w, real* grad_o, real* input, @@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w, int grid_num = width / partial_sum; dim3 threads(blockSize, 1); dim3 grid(grid_num, 1); - KeParamReluBackWardW<<>> - (grad_w, grad_o, input, width, height, partial_sum); + KeParamReluBackWardW<<>>( + grad_w, grad_o, input, width, height, partial_sum); CHECK_SYNC("hl_param_relu_backward_w failed"); } @@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o, CHECK_NOTNULL(diff); dim3 threads(16, 16); int blockX = (width + 16 - 1) / 16; - int blockY = (height + 16 -1) / 16; + int blockY = (height + 16 - 1) / 16; dim3 grid(blockX, blockY); - KeParamReluBackwardDiff<<>> - (grad_o, data, w, diff, width, height, partial_sum); + KeParamReluBackwardDiff<<>>( + grad_o, data, w, diff, width, height, partial_sum); CHECK_SYNC("hl_param_relu_backward_diff failed"); } -__global__ void KeMatrixAddSharedBias(real* A, - real* B, - const int channel, - const int M, - const int N, - real scale) { +__global__ void KeMatrixAddSharedBias( + real* A, real* B, const int channel, const int M, const int N, real scale) { int index = blockIdx.x * blockDim.x + threadIdx.x; int dim = N / channel; if (index < M * N) { @@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d, real scale) { const int blocks = 512; const int grids = DIVUP(dimM * dimN, blocks); - KeMatrixAddSharedBias<<>> - (A_d, B_d, channel, dimM, dimN, scale); + 
KeMatrixAddSharedBias<<>>( + A_d, B_d, channel, dimM, dimN, scale); CHECK_SYNC("hl_matrix_add_shared_bias failed"); } - template -__global__ void KeMatrixCollectSharedBias(real *B, - real *A, +__global__ void KeMatrixCollectSharedBias(real* B, + real* A, const int channel, const int M, const int N, @@ -589,7 +541,7 @@ __global__ void KeMatrixCollectSharedBias(real *B, int n = j * blockSize + tid; int m = n / dim; int w = n % dim; - smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0; + smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0; __syncthreads(); simpleReduce(smem, tid, blockSize); sum += smem[0]; @@ -611,33 +563,32 @@ void hl_matrix_collect_shared_bias(real* B_d, const int limit = 64; int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel; - KeMatrixCollectSharedBias - <<< grids, blocks, 0, STREAM_DEFAULT>>> - (B_d, A_d, channel, dimM, dimN, dim, limit, scale); + KeMatrixCollectSharedBias<<>>( + B_d, A_d, channel, dimM, dimN, dim, limit, scale); CHECK_SYNC("hl_matrix_collect_shared_bias failed"); } -__global__ void keMatrixRotate(real* mat, real* matRot, - int dimM, int dimN, bool clockWise) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < dimM * dimN) { - int i = idx / dimN; - int j = idx % dimN; - if (clockWise) { - matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j]; - } else { - matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)]; - } +__global__ void keMatrixRotate( + real* mat, real* matRot, int dimM, int dimN, bool clockWise) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < dimM * dimN) { + int i = idx / dimN; + int j = idx % dimN; + if (clockWise) { + matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j]; + } else { + matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)]; } + } } -void hl_matrix_rotate(real *mat, real* matRot, - int dimM, int dimN, bool clockWise) { - CHECK_NOTNULL(mat); - CHECK_NOTNULL(matRot); - const int threads = 512; - const int blocks = DIVUP(dimM * 
dimN, threads); - keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>> - (mat, matRot, dimM, dimN, clockWise); - CHECK_SYNC("hl_matrix_rotate failed"); +void hl_matrix_rotate( + real* mat, real* matRot, int dimM, int dimN, bool clockWise) { + CHECK_NOTNULL(mat); + CHECK_NOTNULL(matRot); + const int threads = 512; + const int blocks = DIVUP(dimM * dimN, threads); + keMatrixRotate<<>>( + mat, matRot, dimM, dimN, clockWise); + CHECK_SYNC("hl_matrix_rotate failed"); } diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index eeee921db54e20ea6a017d2b83f2d7ca9e5e037e..c52780dfcaff6e5b94d3568fac4ca011b76a1442 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -16,36 +16,36 @@ limitations under the License. */ #include "hl_device_functions.cuh" #include "paddle/utils/Logging.h" -__global__ void KeMaxSequenceForward(real *input, - const int *sequence, +__global__ void KeMaxSequenceForward(real* input, + const int* sequence, real* output, - int *index, + int* index, int numSequences, int dim) { int dimIdx = threadIdx.x; int sequenceId = blockIdx.x; if (sequenceId >= numSequences) return; int start = sequence[sequenceId]; - int end = sequence[sequenceId+1]; + int end = sequence[sequenceId + 1]; for (int i = dimIdx; i < dim; i += blockDim.x) { real tmp = -HL_FLOAT_MAX; int tmpId = -1; for (int insId = start; insId < end; insId++) { - if (tmp < input[insId*dim + i]) { - tmp = input[insId*dim + i]; + if (tmp < input[insId * dim + i]) { + tmp = input[insId * dim + i]; tmpId = insId; } } - output[sequenceId*dim + i] = tmp; - index[sequenceId*dim + i] = tmpId; + output[sequenceId * dim + i] = tmp; + index[sequenceId * dim + i] = tmpId; } } void hl_max_sequence_forward(real* input, const int* sequence, real* output, - int *index, + int* index, int numSequences, int dim) { CHECK_NOTNULL(input); @@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input, dim3 threads(256, 1); dim3 grid(numSequences, 
1); - KeMaxSequenceForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (input, sequence, output, index, numSequences, dim); + KeMaxSequenceForward<<>>( + input, sequence, output, index, numSequences, dim); CHECK_SYNC("hl_max_sequence_forward failed"); } -__global__ void KeMaxSequenceBackward(real *outputGrad, - int *index, - real* inputGrad, - int numSequences, - int dim) { +__global__ void KeMaxSequenceBackward( + real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) { int idx = threadIdx.x + blockIdx.x * blockDim.x; int colIdx = idx % dim; - if (idx < numSequences*dim) { + if (idx < numSequences * dim) { int insId = index[idx]; inputGrad[insId * dim + colIdx] += outputGrad[idx]; } } -void hl_max_sequence_backward(real* outputGrad, - int *index, - real* inputGrad, - int numSequences, - int dim) { +void hl_max_sequence_backward( + real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) { CHECK_NOTNULL(outputGrad); CHECK_NOTNULL(index); CHECK_NOTNULL(inputGrad); @@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad, unsigned int blocks = (numSequences * dim + 128 - 1) / 128; dim3 threads(128, 1); dim3 grid(blocks, 1); - KeMaxSequenceBackward<<< grid, threads, 0, STREAM_DEFAULT >>> - (outputGrad, index, inputGrad, numSequences, dim); + KeMaxSequenceBackward<<>>( + outputGrad, index, inputGrad, numSequences, dim); CHECK_SYNC("hl_max_sequence_backward failed"); } -template +template __global__ void KeMatrixAddRows(real* output, real* table, int* ids, @@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output, while (sampleId < numSamples) { int tableId = ids[sampleId]; if ((0 <= tableId) && (tableId < tableSize)) { - real *outputData = output + sampleId * dim; - real *tableData = table + tableId * dim; + real* outputData = output + sampleId * dim; + real* tableData = table + tableId * dim; for (int i = idx; i < dim; i += blockDimX) { if (AddRow == 0) { outputData[i] += tableData[i]; @@ -114,24 +108,27 @@ __global__ 
void KeMatrixAddRows(real* output, } } } - sampleId += blockDimY*gridDimX; + sampleId += blockDimY * gridDimX; } } -template -__global__ -void KeSequence2Batch(real *batch, - real *sequence, - const int *batchIndex, - int seqWidth, - int batchCount) { +template +__global__ void KeSequence2Batch(real* batch, + real* sequence, + const int* batchIndex, + int seqWidth, + int batchCount) { int idx = threadIdx.x; int idy = threadIdx.y; int id = blockIdx.x + idy * gridDimX; while (id < batchCount) { int seqId = batchIndex[id]; - real* batchData = batch + id*seqWidth; - real* seqData = sequence + seqId*seqWidth; + real* batchData = batch + id * seqWidth; + real* seqData = sequence + seqId * seqWidth; for (int i = idx; i < seqWidth; i += blockDimX) { if (seq2batch) { if (isAdd) { @@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch, } } } - id += blockDimY*gridDimX; + id += blockDimY * gridDimX; } } -void hl_sequence2batch_copy(real *batch, - real *sequence, - const int *batchIndex, +void hl_sequence2batch_copy(real* batch, + real* sequence, + const int* batchIndex, int seqWidth, int batchCount, bool seq2batch) { @@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch, dim3 threads(128, 8); dim3 grid(8, 1); if (seq2batch) { - KeSequence2Batch<128, 8, 8, 1, 0><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 1, 0><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } else { - KeSequence2Batch<128, 8, 8, 0, 0><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 0, 0><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } CHECK_SYNC("hl_sequence2batch_copy failed"); } -void hl_sequence2batch_add(real *batch, - real *sequence, - int *batchIndex, +void hl_sequence2batch_add(real* batch, + real* sequence, + int* batchIndex, int seqWidth, int batchCount, bool seq2batch) { @@ -186,23 +183,22 @@ void 
hl_sequence2batch_add(real *batch, dim3 threads(128, 8); dim3 grid(8, 1); if (seq2batch) { - KeSequence2Batch<128, 8, 8, 1, 1><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 1, 1><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } else { - KeSequence2Batch<128, 8, 8, 0, 1><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 0, 1><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } CHECK_SYNC("hl_sequence2batch_add failed"); } -template -__global__ -void KeSequence2BatchPadding(real* batch, - real* sequence, - const int* sequenceStartPositions, - const size_t sequenceWidth, - const size_t maxSequenceLength, - const size_t numSequences) { +template +__global__ void KeSequence2BatchPadding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences) { int batchIdx = blockIdx.y; int sequenceStart = sequenceStartPositions[batchIdx]; int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart; @@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch, if (seq2batch) { /* sequence -> batch */ if (normByTimes) { - KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<1, 1><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); } else { - KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<0, 1><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); } } else { /* batch -> sequence */ if (normByTimes) { - 
KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<1, 0><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); } else { - KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<0, 0><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); } } CHECK_SYNC("hl_sequence2batch_copy_padding failed"); } -__device__ inline float my_rsqrt(float x) { - return rsqrtf(x); -} +__device__ inline float my_rsqrt(float x) { return rsqrtf(x); } -__device__ inline double my_rsqrt(double x) { - return rsqrt(x); -} +__device__ inline double my_rsqrt(double x) { return rsqrt(x); } __global__ void KeSequenceAvgForward(real* dst, real* src, @@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst, for (int i = start; i < end; i++) { sum += src[i * width + col]; } - sum = mode == 1 ? sum : - (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength)); + sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength + : sum * my_rsqrt((real)seqLength)); dst[gid] += sum; } } @@ -347,10 +355,10 @@ void hl_sequence_avg_forward(real* dst, int grid = DIVUP(width * height, 512); CHECK(mode == 0 || mode == 1 || mode == 2) - << "mode error in hl_sequence_avg_forward!"; + << "mode error in hl_sequence_avg_forward!"; - KeSequenceAvgForward<<< grid, block, 0, STREAM_DEFAULT >>> - (dst, src, starts, height, width, mode); + KeSequenceAvgForward<<>>( + dst, src, starts, height, width, mode); CHECK_SYNC("hl_sequence_avg_forward failed"); } @@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst, int seqLength = end - start; if (seqLength == 0) return; real grad = src[gid]; - grad = mode == 1 ? grad : - (mode == 0 ? 
grad / seqLength : grad * my_rsqrt((real)seqLength)); + grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength + : grad * my_rsqrt((real)seqLength)); for (int i = start; i < end; i++) { dst[i * width + col] += grad; } @@ -392,9 +400,9 @@ void hl_sequence_avg_backward(real* dst, int grid = DIVUP(width * height, 512); CHECK(mode == 0 || mode == 1 || mode == 2) - << "mode error in hl_sequence_avg_backward!"; + << "mode error in hl_sequence_avg_backward!"; - KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>> - (dst, src, starts, height, width, mode); + KeSequenceAvgBackward<<>>( + dst, src, starts, height, width, mode); CHECK_SYNC("hl_sequence_avg_backward failed"); } diff --git a/paddle/cuda/src/hl_cuda_sparse.cu b/paddle/cuda/src/hl_cuda_sparse.cu index ab9ab57c884137f117c25c2752b5603b2e8b7135..6351e7e01ee55b6303a6e48bc9ebf9834a83130e 100644 --- a/paddle/cuda/src/hl_cuda_sparse.cu +++ b/paddle/cuda/src/hl_cuda_sparse.cu @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "hl_cuda.h" +#include "hl_cuda_sparse.cuh" +#include "hl_matrix_apply.cuh" +#include "hl_matrix_ops.cuh" #include "hl_sparse.h" #include "hl_sparse.ph" -#include "hl_matrix_ops.cuh" -#include "hl_matrix_apply.cuh" -#include "hl_cuda_sparse.cuh" #include "paddle/utils/Logging.h" DEFINE_MATRIX_UNARY_PARAMETER_OP(mul_scalar, ONE_PARAMETER, a = a * p); @@ -34,15 +33,15 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d, CHECK(A_d->format == HL_SPARSE_CSR) << "matrix format error!"; if (A_d->nnz == 0) { - hl_gpu_apply_unary_op( - unary::Zero(), C_d, dimM, dimN, dimN); + hl_gpu_apply_unary_op(unary::Zero(), C_d, dimM, dimN, dimN); return; } /* nnz != 0 */ hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); - CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && - A_d2->csr_row && A_d2->csr_col) << "parameter transa error!"; + CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && A_d2->csr_row && + A_d2->csr_col) + << "parameter transa error!"; int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; @@ -50,21 +49,11 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d, dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsr2Dense<0> - <<>>(A_d2->csr_val, - A_d2->csr_row, - A_d2->csr_col, - C_d, - dimM, - dimN); + KeSMatrixCsr2Dense<0><<>>( + A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN); } else if (A_d->type == HL_FLOAT_VALUE) { - KeSMatrixCsr2Dense<1> - <<>>(A_d2->csr_val, - A_d2->csr_row, - A_d2->csr_col, - C_d, - dimM, - dimN); + KeSMatrixCsr2Dense<1><<>>( + A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN); } else { } CHECK_SYNC("hl_matrix_csr2dense failed"); @@ -80,15 +69,15 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, CHECK(A_d->format == HL_SPARSE_CSC) << "matrix format error!"; if (A_d->nnz == 0) { - hl_gpu_apply_unary_op( - unary::Zero(), C_d, dimM, dimN, dimN); + hl_gpu_apply_unary_op(unary::Zero(), C_d, 
dimM, dimN, dimN); return; } /* nnz != 0 */ hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix); - CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && - A_d2->csc_row && A_d2->csc_col) << "parameter transa error!"; + CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && A_d2->csc_row && + A_d2->csc_col) + << "parameter transa error!"; int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; @@ -96,21 +85,11 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsc2Dense<0> - <<>>(A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - C_d, - dimM, - dimN); + KeSMatrixCsc2Dense<0><<>>( + A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN); } else if (A_d->type == HL_FLOAT_VALUE) { - KeSMatrixCsc2Dense<1> - <<>>(A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - C_d, - dimM, - dimN); + KeSMatrixCsc2Dense<1><<>>( + A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN); } else { } CHECK_SYNC("hl_matrix_csc2dense failed"); @@ -118,43 +97,43 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) { CHECK_NOTNULL(A_d); CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; CHECK(value_type == HL_FLOAT_VALUE || value_type == HL_NO_VALUE) - << "sparse matrix value type error!"; + << "sparse matrix value type error!"; /* avoid malloc 0 bytes */ int nnz_s = (nnz == 0 ? 
1 : nnz); if (format == HL_SPARSE_CSR) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csr_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); CHECK_NOTNULL(tmp); - hl_csr_matrix csr = (hl_csr_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); csr->sparsity = -1.0; if (value_type == HL_NO_VALUE) { csr->csr_val = NULL; csr->nnz_s = nnz_s; - csr->row_s = dimM+1; - csr->csr_row = (int*)hl_malloc_device((dimM+1)*sizeof(int)); - csr->csr_col = (int*)hl_malloc_device((nnz_s)*sizeof(int)); + csr->row_s = dimM + 1; + csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int)); + csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; } else if (value_type == HL_FLOAT_VALUE) { csr->nnz_s = nnz_s; - csr->row_s = dimM+1; - csr->csr_val = (real*)hl_malloc_device((nnz_s)*sizeof(real)); - csr->csr_row = (int*)hl_malloc_device((dimM+1)*sizeof(int)); - csr->csr_col = (int*)hl_malloc_device((nnz_s)*sizeof(int)); + csr->row_s = dimM + 1; + csr->csr_val = (real *)hl_malloc_device((nnz_s) * sizeof(real)); + csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int)); + csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; @@ -162,28 +141,28 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, } else if (format == HL_SPARSE_CSC) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csc_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); CHECK_NOTNULL(tmp); - hl_csc_matrix csc = (hl_csc_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); csc->sparsity 
= -1.0f; if (value_type == HL_NO_VALUE) { csc->csc_val = NULL; csc->nnz_s = nnz_s; - csc->col_s = dimN+1; - csc->csc_row = (int*)hl_malloc_device((nnz_s)*sizeof(int)); - csc->csc_col = (int*)hl_malloc_device((dimN+1)*sizeof(int)); + csc->col_s = dimN + 1; + csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int)); + csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csc; } else if (value_type == HL_FLOAT_VALUE) { csc->nnz_s = nnz_s; - csc->col_s = dimN+1; - csc->csc_val = (real*)hl_malloc_device((nnz_s)*sizeof(real)); - csc->csc_row = (int*)hl_malloc_device((nnz_s)*sizeof(int)); - csc->csc_col = (int*)hl_malloc_device((dimN+1)*sizeof(int)); + csc->col_s = dimN + 1; + csc->csc_val = (real *)hl_malloc_device((nnz_s) * sizeof(real)); + csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int)); + csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csc; @@ -200,7 +179,7 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) { CHECK_NOTNULL(A_d); CHECK(A_d->format == HL_SPARSE_CSR || A_d->format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; if (A_d->matrix == NULL) { free(A_d); @@ -249,77 +228,77 @@ void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) { } void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - void * dest_d, + void *dest_d, size_t size, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) { CHECK_NOTNULL(A_d); CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; if (format == HL_SPARSE_CSR) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - size_t size_ = (dimM+1)*sizeof(int) + nnz*sizeof(int); + size_t size_ = (dimM + 1) * sizeof(int) + nnz * 
sizeof(int); if (value_type != HL_NO_VALUE) { - size_ += nnz*sizeof(real); + size_ += nnz * sizeof(real); } CHECK_LE(size_, size) << "dest_d size(" << size - << ") too small, should bigger than(" << size_ << ")!"; + << ") too small, should bigger than(" << size_ + << ")!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csr_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); CHECK_NOTNULL(tmp); - hl_csr_matrix csr = (hl_csr_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); if (value_type == HL_NO_VALUE) { csr->csr_val = NULL; - csr->csr_row = (int*)dest_d; - csr->csr_col = (int*)((char*)dest_d + (dimM+1)*sizeof(int)); + csr->csr_row = (int *)dest_d; + csr->csr_col = (int *)((char *)dest_d + (dimM + 1) * sizeof(int)); } else { - csr->csr_val = (real*)dest_d; - csr->csr_row = (int*)((char*)dest_d + nnz*sizeof(real)); - csr->csr_col = (int*)((char*)dest_d + - nnz*sizeof(real) + - (dimM+1)*sizeof(int)); + csr->csr_val = (real *)dest_d; + csr->csr_row = (int *)((char *)dest_d + nnz * sizeof(real)); + csr->csr_col = (int *)((char *)dest_d + nnz * sizeof(real) + + (dimM + 1) * sizeof(int)); } csr->nnz_s = nnz; - csr->row_s = dimM+1; + csr->row_s = dimM + 1; csr->sparsity = -1.0; *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; } else if (format == HL_SPARSE_CSC) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - size_t size_ = (dimN+1)*sizeof(int) + nnz*sizeof(int); + size_t size_ = (dimN + 1) * sizeof(int) + nnz * sizeof(int); if (value_type != HL_NO_VALUE) { - size_ += nnz*sizeof(real); + size_ += nnz * sizeof(real); } CHECK_LE(size_, size) << "dest_d size(" << size - << ") too small, should bigger than(" << size_ << ")!"; + << ") too small, should bigger than(" << size_ + << ")!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csc_matrix)); + char *tmp = + (char 
*)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); CHECK_NOTNULL(tmp); - hl_csc_matrix csc = (hl_csc_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); if (value_type == HL_NO_VALUE) { csc->csc_val = NULL; - csc->csc_col = (int*)dest_d; - csc->csc_row = (int*)((char*)dest_d + (dimN+1)*sizeof(int)); + csc->csc_col = (int *)dest_d; + csc->csc_row = (int *)((char *)dest_d + (dimN + 1) * sizeof(int)); } else { - csc->csc_val = (real*)dest_d; - csc->csc_col = (int*)((char*)dest_d + nnz*sizeof(real)); - csc->csc_row = (int*)((char*)dest_d + - nnz*sizeof(real) + - (dimN+1)*sizeof(int)); + csc->csc_val = (real *)dest_d; + csc->csc_col = (int *)((char *)dest_d + nnz * sizeof(real)); + csc->csc_row = (int *)((char *)dest_d + nnz * sizeof(real) + + (dimN + 1) * sizeof(int)); } csc->nnz_s = nnz; - csc->col_s = dimN+1; + csc->col_s = dimN + 1; csc->sparsity = -1.0f; *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csc; @@ -333,11 +312,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, } void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - real* value_d, - int* rows_d, - int* cols_d, + real *value_d, + int *rows_d, + int *cols_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) { @@ -345,11 +324,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; if (format == HL_SPARSE_CSR) { - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csr_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); CHECK_NOTNULL(tmp); hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); @@ -362,8 +341,8 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s 
*A_d, *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; } else if (format == HL_SPARSE_CSC) { - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csc_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); CHECK_NOTNULL(tmp); hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); @@ -396,35 +375,30 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix, hl_stream_t stream) { CHECK_NOTNULL(csr_matrix); CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR) - << "csr_matrix is not csr format!"; + << "csr_matrix is not csr format!"; CHECK_NOTNULL(csr_matrix->matrix); hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix); - CHECK_LE(csr_matrix->nnz, csr->nnz_s) - << "copy size " << csr_matrix->nnz - << " is big than alloc size " << csr->nnz_s; + CHECK_LE(csr_matrix->nnz, csr->nnz_s) << "copy size " << csr_matrix->nnz + << " is big than alloc size " + << csr->nnz_s; - CHECK_LE((csr_matrix->rows+1), csr->row_s) - << "copy size " << (csr_matrix->rows + 1) - << " is big than alloc size " << csr->row_s; + CHECK_LE((csr_matrix->rows + 1), csr->row_s) + << "copy size " << (csr_matrix->rows + 1) << " is big than alloc size " + << csr->row_s; - CHECK(csr_matrix->type == HL_FLOAT_VALUE || - csr_matrix->type == HL_NO_VALUE) - << "sparse matrix value type error!"; + CHECK(csr_matrix->type == HL_FLOAT_VALUE || csr_matrix->type == HL_NO_VALUE) + << "sparse matrix value type error!"; if (csr_matrix->type == HL_NO_VALUE) { if (csr_row == NULL && csr_col == NULL) { return; } else if (csr_row != NULL && csr_col != NULL) { - hl_memcpy_async(csr->csr_row, - csr_row, - (csr_matrix->rows+1)*sizeof(int), - stream); + hl_memcpy_async( + csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream); - hl_memcpy_async(csr->csr_col, - csr_col, - (csr_matrix->nnz)*sizeof(int), - stream); + hl_memcpy_async( + csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream); } else { LOG(FATAL) << 
"parameter csr_row or csr_col is null pointer!"; } @@ -432,30 +406,21 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix, if (csr_val == NULL && csr_row == NULL && csr_col == NULL) { return; } else if (csr_val != NULL && csr_row == NULL && csr_col == NULL) { - hl_memcpy_async(csr->csr_val, - csr_val, - (csr_matrix->nnz)*sizeof(real), - stream); + hl_memcpy_async( + csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream); } else if (csr_val != NULL && csr_row != NULL && csr_col != NULL) { - hl_memcpy_async(csr->csr_val, - csr_val, - (csr_matrix->nnz)*sizeof(real), - stream); - hl_memcpy_async(csr->csr_row, - csr_row, - (csr_matrix->rows+1)*sizeof(int), - stream); - hl_memcpy_async(csr->csr_col, - csr_col, - (csr_matrix->nnz)*sizeof(int), - stream); + hl_memcpy_async( + csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream); + hl_memcpy_async( + csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream); + hl_memcpy_async( + csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream); } else { LOG(FATAL) << "parameter csr_row or csr_col is null pointer!"; } } - csr->sparsity = ((float)csr_matrix->nnz) / - ((float)csr_matrix->rows) / + csr->sparsity = ((float)csr_matrix->nnz) / ((float)csr_matrix->rows) / ((float)csr_matrix->cols); } @@ -466,33 +431,28 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix, hl_stream_t stream) { CHECK_NOTNULL(csc_matrix); CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC) - << "csc_matrix is not csc format error!"; + << "csc_matrix is not csc format error!"; hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix); - CHECK_LE(csc_matrix->nnz, csc->nnz_s) - << "copy size " << csc_matrix->nnz - << " is big than alloc size " << csc->nnz_s; + CHECK_LE(csc_matrix->nnz, csc->nnz_s) << "copy size " << csc_matrix->nnz + << " is big than alloc size " + << csc->nnz_s; - CHECK_LE((csc_matrix->cols+1), csc->col_s) - << "copy size " <<(csc_matrix->cols + 1) - << " is big than alloc size " << csc->col_s; + 
CHECK_LE((csc_matrix->cols + 1), csc->col_s) + << "copy size " << (csc_matrix->cols + 1) << " is big than alloc size " + << csc->col_s; - CHECK(csc_matrix->type == HL_FLOAT_VALUE || - csc_matrix->type == HL_NO_VALUE) - << "sparse matrix value type error!"; + CHECK(csc_matrix->type == HL_FLOAT_VALUE || csc_matrix->type == HL_NO_VALUE) + << "sparse matrix value type error!"; if (csc_matrix->type == HL_NO_VALUE) { if (csc_row == NULL && csc_col == NULL) { return; } else if (csc_row != NULL && csc_col != NULL) { - hl_memcpy_async(csc->csc_row, - csc_row, - (csc_matrix->nnz)*sizeof(int), - stream); - hl_memcpy_async(csc->csc_col, - csc_col, - (csc_matrix->cols+1)*sizeof(int), - stream); + hl_memcpy_async( + csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream); + hl_memcpy_async( + csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); } else { LOG(FATAL) << "parameter csc_row or csc_col is null pointer!"; } @@ -500,30 +460,21 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix, if (csc_val == NULL && csc_row == NULL && csc_col == NULL) { return; } else if (csc_val != NULL && csc_row == NULL && csc_col == NULL) { - hl_memcpy_async(csc->csc_val, - csc_val, - (csc_matrix->nnz)*sizeof(real), - stream); + hl_memcpy_async( + csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream); } else if (csc_val != NULL && csc_row != NULL && csc_col != NULL) { - hl_memcpy_async(csc->csc_val, - csc_val, - (csc_matrix->nnz)*sizeof(real), - stream); - hl_memcpy_async(csc->csc_row, - csc_row, - (csc_matrix->nnz)*sizeof(int), - stream); - hl_memcpy_async(csc->csc_col, - csc_col, - (csc_matrix->cols+1)*sizeof(int), - stream); + hl_memcpy_async( + csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream); + hl_memcpy_async( + csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream); + hl_memcpy_async( + csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); } else { LOG(FATAL) << "parameter csc_row or csc_col is null 
pointer!"; } } - csc->sparsity = ((float)csc_matrix->nnz) / - ((float)csc_matrix->rows) / + csc->sparsity = ((float)csc_matrix->nnz) / ((float)csc_matrix->rows) / ((float)csc_matrix->cols); } @@ -531,32 +482,23 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst, hl_sparse_matrix_s src, hl_stream_t stream) { CHECK(dst && src && dst->matrix && src->matrix) - << "parameter dst or src is null pointer!"; - CHECK_EQ(dst->format, src->format) - << "sparse matrix format does not match!"; + << "parameter dst or src is null pointer!"; + CHECK_EQ(dst->format, src->format) << "sparse matrix format does not match!"; CHECK(dst->type != HL_FLOAT_VALUE || src->type != HL_NO_VALUE) - << "src sparse matrix is no value, dst sparse matrix has value!"; + << "src sparse matrix is no value, dst sparse matrix has value!"; if (dst->format == HL_SPARSE_CSR) { dst->rows = src->rows; dst->cols = src->cols; - dst->nnz = src->nnz; + dst->nnz = src->nnz; hl_csr_matrix csr = (hl_csr_matrix)src->matrix; - hl_memcpy_csr_matrix(dst, - csr->csr_val, - csr->csr_row, - csr->csr_col, - stream); + hl_memcpy_csr_matrix(dst, csr->csr_val, csr->csr_row, csr->csr_col, stream); } else if (dst->format == HL_SPARSE_CSC) { dst->rows = src->rows; dst->cols = src->cols; - dst->nnz = src->nnz; + dst->nnz = src->nnz; hl_csc_matrix csc = (hl_csc_matrix)src->matrix; - hl_memcpy_csc_matrix(dst, - csc->csc_val, - csc->csc_row, - csc->csc_col, - stream); + hl_memcpy_csc_matrix(dst, csc->csc_val, csc->csc_row, csc->csc_col, stream); } else { LOG(FATAL) << "sparse matrix format error!"; } @@ -569,20 +511,24 @@ static void _beta_mul_c(real *c, int dimM, int dimN, real beta) { if (beta == 0.0) { hl_gpu_apply_unary_op(unary::Zero(), c, dimM, dimN, dimN); } else { - if (beta != 1.0){ - hl_gpu_apply_unary_op( - unary::mul_scalar(beta), c, dimM, dimN, dimN); + if (beta != 1.0) { + hl_gpu_apply_unary_op(unary::mul_scalar(beta), c, dimM, dimN, dimN); } } return; } -void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, 
hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transb, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -592,7 +538,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) || (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) { - LOG(FATAL) << "parameter error!"; + LOG(FATAL) << "parameter error!"; } if (A_d->nnz == 0) { @@ -603,8 +549,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, /* nnz != 0 */ hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) || - A_d2->csr_row == NULL || - A_d2->csr_col == NULL) { + A_d2->csr_row == NULL || A_d2->csr_col == NULL) { LOG(FATAL) << "parameter error!"; } @@ -617,63 +562,63 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, /* sparsity pattern */ // A_d->sparsity; if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsrMulDense<0> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<0><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCsrMulDense<1> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<1><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (HPPL_OP_T == transa) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / - CU_CSC_MUL_DENSE_BLOCK_N; - int blocksY = (dimK + 
CU_CSC_MUL_DENSE_BLOCK_K - 1) / - CU_CSC_MUL_DENSE_BLOCK_K; + int blocksX = + (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N; + int blocksY = + (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K; dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y); dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCscMulDense<0> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<0><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCscMulDense<1> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<1><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transa error!"; @@ -682,11 +627,16 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_csr_mul_dense failed"); } -void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, - hl_sparse_matrix_s B_d, hl_trans_op_t transb, +void hl_matrix_dense_mul_csc(real *A_d, + hl_trans_op_t transa, + hl_sparse_matrix_s B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transa, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -698,8 +648,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, LOG(FATAL) << "parameter dims error!"; } - CHECK_EQ(B_d->format, HL_SPARSE_CSC) - << "matrix format error!"; + CHECK_EQ(B_d->format, HL_SPARSE_CSC) << "matrix format error!"; if (B_d->nnz == 0) { _beta_mul_c(C_d, dimM, dimN, beta); @@ -709,8 +658,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, /* nnz != 0 */ hl_csc_matrix B_d2 = 
(hl_csc_matrix)(B_d->matrix); if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) || - B_d2->csc_row == NULL || - B_d2->csc_col == NULL) { + B_d2->csc_row == NULL || B_d2->csc_col == NULL) { LOG(FATAL) << "parameter B is null!"; } @@ -721,60 +669,60 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsc<0> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_row, - B_d2->csc_col, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<0><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_row, + B_d2->csc_col, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsc<1> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_row, - B_d2->csc_col, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<1><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_row, + B_d2->csc_col, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (transb == HPPL_OP_T) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X; - int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M; + int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X; + int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M; dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y); dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsr<0> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_col, - B_d2->csc_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsr<0><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_col, + B_d2->csc_row, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsr<1> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_col, - B_d2->csc_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsr<1><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_col, + B_d2->csc_row, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transb error!"; @@ -783,24 +731,28 @@ void hl_matrix_dense_mul_csc(real 
*A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_dense_mul_csc failed"); } -void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, - hl_sparse_matrix_s B_d, hl_trans_op_t transb, +void hl_matrix_dense_mul_csr(real *A_d, + hl_trans_op_t transa, + hl_sparse_matrix_s B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transa, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); - if (dimM <= 0 || dimN <= 0 || dimK <= 0 - || (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) - || (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) { + if (dimM <= 0 || dimN <= 0 || dimK <= 0 || + (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) || + (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) { LOG(FATAL) << "parameter dims error!"; } - CHECK_EQ(B_d->format, HL_SPARSE_CSR) - << "matrix format error!"; + CHECK_EQ(B_d->format, HL_SPARSE_CSR) << "matrix format error!"; if (B_d->nnz == 0) { _beta_mul_c(C_d, dimM, dimN, beta); @@ -810,41 +762,40 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, /* nnz != 0 */ hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix); if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) || - B_d2->csr_row == NULL || - B_d2->csr_col == NULL) { + B_d2->csr_row == NULL || B_d2->csr_col == NULL) { LOG(FATAL) << "parameter transa error!"; } if (transb == HPPL_OP_N) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X; - int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M; + int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X; + int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M; dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y); dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsr<0> - <<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_row, - B_d2->csr_col, - dimM, - dimN, - dimK, - alpha, 
- beta); + KeSMatrixDenseMulCsr<0><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_row, + B_d2->csr_col, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsr<1> - <<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_row, - B_d2->csr_col, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsr<1><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_row, + B_d2->csr_col, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (transb == HPPL_OP_T) { int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST; @@ -852,29 +803,29 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST); dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsc<0> - <<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_col, - B_d2->csr_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<0><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_col, + B_d2->csr_row, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsc<1> - <<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_col, - B_d2->csr_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<1><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_col, + B_d2->csr_row, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transb error!"; @@ -883,11 +834,16 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_dense_mul_csr failed"); } -void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transb, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -908,42 +864,43 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, 
hl_trans_op_t transa, /* nnz != 0 */ hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix); if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) || - A_d2->csc_row == NULL || - A_d2->csc_col == NULL) { + A_d2->csc_row == NULL || A_d2->csc_col == NULL) { LOG(FATAL) << "parameter error!"; } if (HPPL_OP_N == transa) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N -1)/CU_CSC_MUL_DENSE_BLOCK_N; - int blocksY = (dimK + CU_CSC_MUL_DENSE_BLOCK_K -1)/CU_CSC_MUL_DENSE_BLOCK_K; + int blocksX = + (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N; + int blocksY = + (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K; dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y); dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCscMulDense<0> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<0><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCscMulDense<1> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<1><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (HPPL_OP_T == transa) { int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N; @@ -954,29 +911,29 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, /* sparsity pattern */ // A_d->sparsity; if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsrMulDense<0> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<0><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCsrMulDense<1> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - 
A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<1><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transa error!"; @@ -985,11 +942,16 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_csc_mul_dense failed"); } -void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, - hl_sparse_matrix_s C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { +void hl_sparse_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, + hl_sparse_matrix_s C_d, + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); @@ -1000,18 +962,14 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, if (C_d->format == HL_SPARSE_CSC) { hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix); - if (C_d2->csc_val == NULL || - C_d2->csc_row == NULL || + if (C_d2->csc_val == NULL || C_d2->csc_row == NULL || C_d2->csc_col == NULL) { LOG(FATAL) << "parameter error!"; } if (beta != 1.0) { - hl_gpu_apply_unary_op(unary::mul_scalar(beta), - C_d2->csc_val, - 1, - C_d->nnz, - C_d->nnz); + hl_gpu_apply_unary_op( + unary::mul_scalar(beta), C_d2->csc_val, 1, C_d->nnz, C_d->nnz); } int blocksX = dimN; @@ -1020,34 +978,30 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, dim3 grid(blocksX, blocksY); bool transA = transa == HPPL_OP_T ? 1 : 0; bool transB = transb == HPPL_OP_T ? 
1 : 0; - KeSMatrixDenseMulDense2CSC - <<>>(C_d2->csc_val, - C_d2->csc_row, - C_d2->csc_col, - A_d, - B_d, - transA, - transB, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulDense2CSC<<>>( + C_d2->csc_val, + C_d2->csc_row, + C_d2->csc_col, + A_d, + B_d, + transA, + transB, + dimM, + dimN, + dimK, + alpha, + beta); CHECK_SYNC("hl_sparse_matrix_mul failed"); } else { hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix); if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) || - C_d2->csr_row == NULL || - C_d2->csr_col == NULL) { + C_d2->csr_row == NULL || C_d2->csr_col == NULL) { LOG(FATAL) << "parameter error!"; } if (beta != 1.0) { - hl_gpu_apply_unary_op(unary::mul_scalar(beta), - C_d2->csr_val, - 1, - C_d->nnz, - C_d->nnz); + hl_gpu_apply_unary_op( + unary::mul_scalar(beta), C_d2->csr_val, 1, C_d->nnz, C_d->nnz); } bool transA = transa == HPPL_OP_T ? 1 : 0; @@ -1058,20 +1012,20 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1); dim3 grid(blocksX, blocksY); - KeSMatrixDenseMulDense2CSR - <<>>(C_d2->csr_val, - C_d2->csr_row, - C_d2->csr_col, - A_d, - B_d, - transA, - transB, - dimM, - dimN, - dimK, - alpha, - beta); - CHECK_SYNC("hl_sparse_matrix_mul failed"); + KeSMatrixDenseMulDense2CSR<<>>( + C_d2->csr_val, + C_d2->csr_row, + C_d2->csr_col, + A_d, + B_d, + transA, + transB, + dimM, + dimN, + dimK, + alpha, + beta); + CHECK_SYNC("hl_sparse_matrix_mul failed"); } else { CHECK(!transA) << "Not supported A is trans and B is not trans!"; @@ -1080,21 +1034,21 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, avgNnzPerRow = avgNnzPerRow > 0 ? 
avgNnzPerRow : 1; int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE); dim3 grid(gridx, dimM); - KeSMatrixDenseMulDenseTrans2CSR - <<>>(C_d2->csr_val, - C_d2->csr_row, - C_d2->csr_col, - A_d, - B_d, - transA, - transB, - dimM, - dimN, - dimK, - alpha, - beta); - CHECK_SYNC("hl_sparse_matrix_mul failed"); - } + KeSMatrixDenseMulDenseTrans2CSR<<>>( + C_d2->csr_val, + C_d2->csr_row, + C_d2->csr_col, + A_d, + B_d, + transA, + transB, + dimM, + dimN, + dimK, + alpha, + beta); + CHECK_SYNC("hl_sparse_matrix_mul failed"); + } } } @@ -1111,7 +1065,7 @@ void hl_memcpy_from_csc_matrix(real *csc_val, CHECK_NOTNULL(csc_col); CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC) - << "csc_matrix is not csc format error!"; + << "csc_matrix is not csc format error!"; if (csc_matrix->nnz > row_size || csc_matrix->cols + 1 > static_cast(col_size)) { @@ -1119,20 +1073,20 @@ void hl_memcpy_from_csc_matrix(real *csc_val, } hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix); - hl_memcpy_async((void*)csc_row, - (void*)csc->csc_row, + hl_memcpy_async((void *)csc_row, + (void *)csc->csc_row, (csc_matrix->nnz) * sizeof(int), stream); - hl_memcpy_async((void*)csc_col, - (void*)csc->csc_col, + hl_memcpy_async((void *)csc_col, + (void *)csc->csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); if (csc_matrix->type == HL_FLOAT_VALUE) { if (csc_val != NULL) { CHECK_LE(csc_matrix->nnz, val_size) << "size not match!"; - hl_memcpy_async((void*)csc_val, - (void*)csc->csc_val, - (csc_matrix->nnz)*sizeof(real), + hl_memcpy_async((void *)csc_val, + (void *)csc->csc_val, + (csc_matrix->nnz) * sizeof(real), stream); } else { LOG(FATAL) << "parameter csr_val is null pointer!"; @@ -1152,7 +1106,7 @@ void hl_memcpy_from_csr_matrix(real *csr_val, CHECK_NOTNULL(csr_row); CHECK_NOTNULL(csr_col); CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR) - << "csr_matrix is not csr format error!"; + << "csr_matrix is not csr format error!"; if (csr_matrix->nnz > col_size || csr_matrix->rows + 1 > static_cast(row_size)) { @@ 
-1160,20 +1114,20 @@ void hl_memcpy_from_csr_matrix(real *csr_val, } hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix); - hl_memcpy_async((void*)csr_row, - (void*)csr->csr_row, - (csr_matrix->rows+1)*sizeof(int), + hl_memcpy_async((void *)csr_row, + (void *)csr->csr_row, + (csr_matrix->rows + 1) * sizeof(int), stream); - hl_memcpy_async((void*)csr_col, - (void*)csr->csr_col, - (csr_matrix->nnz)*sizeof(int), + hl_memcpy_async((void *)csr_col, + (void *)csr->csr_col, + (csr_matrix->nnz) * sizeof(int), stream); if (csr_matrix->type == HL_FLOAT_VALUE) { if (csr_val != NULL) { CHECK_LE(csr_matrix->nnz, val_size) << "size not match!"; - hl_memcpy_async((void*)csr_val, - (void*)csr->csr_val, - (csr_matrix->nnz)*sizeof(real), + hl_memcpy_async((void *)csr_val, + (void *)csr->csr_val, + (csr_matrix->nnz) * sizeof(real), stream); } else { LOG(FATAL) << "parameter csr_val is null pointer!"; @@ -1181,8 +1135,8 @@ void hl_memcpy_from_csr_matrix(real *csr_val, } } -void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM, - int dimN, real scale) { +void hl_sparse_matrix_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) { if (B_d->format == HL_SPARSE_CSR) { hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale); } else { @@ -1190,8 +1144,8 @@ void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM, } } -void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d, - int dimM, int dimN, real scale) { +void hl_matrix_csr_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -1216,8 +1170,7 @@ void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d, CHECK_SYNC("hl_matrix_csr_column_sum failed"); } -void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, - real* B_d, real scale) { +void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) { if (A_d->format == HL_SPARSE_CSR) { hl_matrix_csr_add_bias(A_d, B_d, 
scale); } else { @@ -1225,8 +1178,7 @@ void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, } } -void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d, - real scale) { +void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -1247,8 +1199,12 @@ void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d, CHECK_SYNC("hl_sparse_matrix_add_bias failed"); } -void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM, - int dimN, real alpha, real beta) { +void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, + real *B_d, + int dimM, + int dimN, + real alpha, + real beta) { if (A_d->format == HL_SPARSE_CSR) { hl_matrix_csr_add_dense(A_d, B_d, dimM, dimN, alpha, beta); } else { @@ -1256,8 +1212,12 @@ void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM, } } -void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM, - int dimN, real alpha, real beta) { +void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, + real *B_d, + int dimM, + int dimN, + real alpha, + real beta) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -1277,20 +1237,26 @@ void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM, gridX = gridX > 0 ? 
gridX : 1; dim3 block(512, 1); dim3 grid(gridX, dimM); - KeSMatrixCsrAddDense<<>>( - A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, B_d, alpha, beta, dimM, dimN); + KeSMatrixCsrAddDense<<>>(A_d2->csr_val, + A_d2->csr_row, + A_d2->csr_col, + B_d, + alpha, + beta, + dimM, + dimN); CHECK_SYNC("hl_sparse_matrix_add_dense failed"); } -int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { +int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { __sparse_get_return__(sMat, row); } -int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { +int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { __sparse_get_return__(sMat, col); } -real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) { +real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) { __sparse_get_return__(sMat, val); } diff --git a/paddle/cuda/src/hl_perturbation_util.cu b/paddle/cuda/src/hl_perturbation_util.cu index 2a945bcdb87fe49c121890128ef77b084ebe8e60..d01a91561efa2ebe8e0cabc2b4e8885f2c02ab48 100644 --- a/paddle/cuda/src/hl_perturbation_util.cu +++ b/paddle/cuda/src/hl_perturbation_util.cu @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#include #include -#include "hl_cuda.h" -#include "hl_time.h" +#include #include "hl_base.h" +#include "hl_cuda.h" #include "hl_perturbation_util.cuh" +#include "hl_time.h" #define _USE_MATH_DEFINES @@ -30,10 +29,16 @@ limitations under the License. */ * centerX, centerY: translation. * sourceX, sourceY: output coordinates in the original image. 
*/ -__device__ void getTranformCoord(int x, int y, real theta, real scale, - real tgtCenter, real imgCenter, - real centerR, real centerC, - int* sourceX, int* sourceY) { +__device__ void getTranformCoord(int x, + int y, + real theta, + real scale, + real tgtCenter, + real imgCenter, + real centerR, + real centerC, + int* sourceX, + int* sourceY) { real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)}; // compute coornidates in the rotated and scaled image @@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale, * created by Wei Xu (genome), converted by Jiang Wang */ -__global__ void kSamplingPatches(const real* imgs, real* targets, - int imgSize, int tgtSize, const int channels, - int samplingRate, const real* thetas, - const real* scales, const int* centerRs, - const int* centerCs, const real padValue, +__global__ void kSamplingPatches(const real* imgs, + real* targets, + int imgSize, + int tgtSize, + const int channels, + int samplingRate, + const real* thetas, + const real* scales, + const int* centerRs, + const int* centerCs, + const real padValue, const int numImages) { const int caseIdx = blockIdx.x * 4 + threadIdx.x; const int pxIdx = blockIdx.y * 128 + threadIdx.y; @@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets, const int pxY = pxIdx / tgtSize; int srcPxX, srcPxY; - getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter, - imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX, + getTranformCoord(pxX, + pxY, + thetas[imgIdx], + scales[imgIdx], + tgtCenter, + imgCenter, + centerCs[caseIdx], + centerRs[caseIdx], + &srcPxX, &srcPxY); imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels; @@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets, * * created by Wei Xu */ -void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, - int*& gpuCenterR, int*& gpuCenterC, - int numImages, int imgSize, real 
rotateAngle, - real scaleRatio, int samplingRate, +void hl_generate_disturb_params(real*& gpuAngle, + real*& gpuScaleRatio, + int*& gpuCenterR, + int*& gpuCenterC, + int numImages, + int imgSize, + real rotateAngle, + real scaleRatio, + int samplingRate, bool isTrain) { // The number of output samples. int numPatches = numImages * samplingRate; @@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, for (int i = 0; i < numImages; i++) { r_angle[i] = (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0) // NOLINT - - 0.5); + - + 0.5); s_ratio[i] = 1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio; // NOLINT } @@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, int pxY = (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0)); // NOLINT - const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]), - sin(-r_angle[i]), cos(-r_angle[i])}; + const real H[4] = {cos(-r_angle[i]), + -sin(-r_angle[i]), + sin(-r_angle[i]), + cos(-r_angle[i])}; real x = pxX - imgCenter; real y = pxY - imgCenter; real xx = H[0] * x + H[1] * y; @@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, delete[] center_c; } -void hl_conv_random_disturb_with_params(const real* images, int imgSize, - int tgtSize, int channels, - int numImages, int samplingRate, +void hl_conv_random_disturb_with_params(const real* images, + int imgSize, + int tgtSize, + int channels, + int numImages, + int samplingRate, const real* gpuRotationAngle, const real* gpuScaleRatio, const int* gpuCenterR, @@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize, dim3 threadsPerBlock(4, 128); dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128)); - kSamplingPatches <<>> - (images, target, imgSize, tgtSize, channels, samplingRate, - gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC, - paddingValue, numImages); + kSamplingPatches<<>>(images, + target, + imgSize, + tgtSize, 
+ channels, + samplingRate, + gpuRotationAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + paddingValue, + numImages); hl_device_synchronize(); } -void hl_conv_random_disturb(const real* images, int imgSize, - int tgtSize, int channels, int numImages, - real scaleRatio, real rotateAngle, - int samplingRate, real* gpu_r_angle, - real* gpu_s_ratio, int* gpu_center_r, - int* gpu_center_c, int paddingValue, - bool isTrain, real* targets) { +void hl_conv_random_disturb(const real* images, + int imgSize, + int tgtSize, + int channels, + int numImages, + real scaleRatio, + real rotateAngle, + int samplingRate, + real* gpu_r_angle, + real* gpu_s_ratio, + int* gpu_center_r, + int* gpu_center_c, + int paddingValue, + bool isTrain, + real* targets) { // generate the random disturbance sequence and the sampling locations - hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r, - gpu_center_c, numImages, imgSize, rotateAngle, - scaleRatio, samplingRate, isTrain); - - hl_conv_random_disturb_with_params( - images, imgSize, tgtSize, channels, numImages, - samplingRate, gpu_r_angle, gpu_s_ratio, - gpu_center_r, gpu_center_r, paddingValue, - targets); + hl_generate_disturb_params(gpu_r_angle, + gpu_s_ratio, + gpu_center_r, + gpu_center_c, + numImages, + imgSize, + rotateAngle, + scaleRatio, + samplingRate, + isTrain); + + hl_conv_random_disturb_with_params(images, + imgSize, + tgtSize, + channels, + numImages, + samplingRate, + gpu_r_angle, + gpu_s_ratio, + gpu_center_r, + gpu_center_r, + paddingValue, + targets); } diff --git a/paddle/cuda/src/hl_table_apply.cu b/paddle/cuda/src/hl_table_apply.cu index 61edbe3ccc7028fd8779c4119f33c4cb5afe0564..d3b71c75e6e69d48c8d98041e3d6075aa8d53610 100644 --- a/paddle/cuda/src/hl_table_apply.cu +++ b/paddle/cuda/src/hl_table_apply.cu @@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "hl_base.h" -#include "hl_device_functions.cuh" #include "hl_cuda.h" +#include "hl_device_functions.cuh" #include "paddle/utils/Logging.h" -template -__global__ void KeMatrixAddRows(real* output, int ldo, - real* table, int ldt, +template +__global__ void KeMatrixAddRows(real* output, + int ldo, + real* table, + int ldt, int* ids, int numSamples, int tableSize, @@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo, while (idy < numSamples) { int tableId = ids[idy]; if ((0 <= tableId) && (tableId < tableSize)) { - real *out = output + idy * ldo; - real *tab = table + tableId * ldt; + real* out = output + idy * ldo; + real* tab = table + tableId * ldt; for (int i = idx; i < dim; i += blockDimX) { if (AddRow) { paddle::paddleAtomicAdd(&tab[i], out[i]); @@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo, } } -void hl_matrix_select_rows(real* output, int ldo, - real* table, int ldt, +void hl_matrix_select_rows(real* output, + int ldo, + real* table, + int ldt, int* ids, int numSamples, int tableSize, @@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo, dim3 threads(128, 8); dim3 grid(8, 1); - KeMatrixAddRows<128, 8, 8, 0><<< grid, threads, 0, STREAM_DEFAULT >>> - (output, ldo, table, ldt, ids, numSamples, tableSize, dim); + KeMatrixAddRows<128, 8, 8, 0><<>>( + output, ldo, table, ldt, ids, numSamples, tableSize, dim); CHECK_SYNC("hl_matrix_select_rows failed"); } -void hl_matrix_add_to_rows(real* table, int ldt, - real* input, int ldi, +void hl_matrix_add_to_rows(real* table, + int ldt, + real* input, + int ldi, int* ids, int numSamples, int tableSize, @@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt, dim3 threads(128, 8); dim3 grid(8, 1); - KeMatrixAddRows<128, 8, 8, 1><<< grid, threads, 0, STREAM_DEFAULT >>> - (input, ldi, table, ldt, ids, numSamples, tableSize, dim); + KeMatrixAddRows<128, 8, 8, 1><<>>( + input, ldi, table, ldt, ids, numSamples, tableSize, dim); 
CHECK_SYNC("hl_matrix_add_to_rows failed"); } -template -__global__ void KeVectorSelect(T* dst, int sized, - const T* src, int sizes, - const int* ids, int sizei) { +template +__global__ void KeVectorSelect( + T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) { int idx = threadIdx.x + blockDimX * blockIdx.x; while (idx < sizei) { int index = ids[idx]; @@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized, } template -void hl_vector_select_from(T* dst, int sized, - const T* src, int sizes, - const int* ids, int sizei) { +void hl_vector_select_from( + T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) { CHECK_NOTNULL(dst); CHECK_NOTNULL(src); CHECK_NOTNULL(ids); @@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized, dim3 threads(512, 1); dim3 grid(8, 1); - KeVectorSelect<<< grid, threads, 0, STREAM_DEFAULT >>> - (dst, sized, src, sizes, ids, sizei); + KeVectorSelect<<>>( + dst, sized, src, sizes, ids, sizei); CHECK_SYNC("hl_vector_select_from failed"); } -template -void hl_vector_select_from(real* dst, int sized, - const real* src, int sizes, - const int* ids, int sizei); -template -void hl_vector_select_from(int* dst, int sized, - const int* src, int sizes, - const int* ids, int sizei); - +template void hl_vector_select_from(real* dst, + int sized, + const real* src, + int sizes, + const int* ids, + int sizei); +template void hl_vector_select_from( + int* dst, int sized, const int* src, int sizes, const int* ids, int sizei); diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu index 4f0bbfcf4e3aa51dd06acf254af65c62098a1df7..1896a56634c3a75e5a2a1e08661088b263f8ee10 100644 --- a/paddle/cuda/src/hl_top_k.cu +++ b/paddle/cuda/src/hl_top_k.cu @@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "hl_base.h" -#include "hl_top_k.h" #include "hl_sparse.ph" +#include "hl_top_k.h" #include "paddle/utils/Logging.h" // using namespace hppl; struct Pair { - __device__ __forceinline__ - Pair() {} + __device__ __forceinline__ Pair() {} - __device__ __forceinline__ - Pair(real value, int id) : v_(value), id_(id) {} + __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {} - __device__ __forceinline__ - void set(real value, int id) { + __device__ __forceinline__ void set(real value, int id) { v_ = value; id_ = id; } - __device__ __forceinline__ - void operator=(const Pair& in) { + __device__ __forceinline__ void operator=(const Pair& in) { v_ = in.v_; id_ = in.id_; } - __device__ __forceinline__ - bool operator<(const real value) const { + __device__ __forceinline__ bool operator<(const real value) const { return (v_ < value); } - __device__ __forceinline__ - bool operator<(const Pair& in) const { + __device__ __forceinline__ bool operator<(const Pair& in) const { return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_)); } - __device__ __forceinline__ - bool operator>(const Pair& in) const { + __device__ __forceinline__ bool operator>(const Pair& in) const { return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_)); } @@ -58,8 +50,9 @@ struct Pair { int id_; }; -__device__ __forceinline__ -void addTo(Pair topK[], const Pair &p, int beamSize) { +__device__ __forceinline__ void addTo(Pair topK[], + const Pair& p, + int beamSize) { for (int k = beamSize - 2; k >= 0; k--) { if (topK[k] < p) { topK[k + 1] = topK[k]; @@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) { topK[0] = p; } -template -__device__ __forceinline__ -void addTo(Pair topK[], const Pair &p) { +template +__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) { for (int k = beamSize - 2; k >= 0; k--) { if (topK[k] < p) { topK[k + 1] = topK[k]; @@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) { topK[0] = p; } -template -__device__ 
__forceinline__ -void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) { +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* src, int idx, int dim, int beamSize) { while (idx < dim) { if (topK[beamSize - 1] < src[idx]) { Pair tmp(src[idx], idx); @@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) { } } -template -__device__ __forceinline__ -void getTopK(Pair topK[], real *src, int idx, int dim, - const Pair& max, int beamSize) { +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) { while (idx < dim) { if (topK[beamSize - 1] < src[idx]) { Pair tmp(src[idx], idx); @@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, } } -template -__device__ __forceinline__ -void getTopK(Pair topK[], real *val, int *col, - int idx, int dim, int beamSize) { +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* val, int* col, int idx, int dim, int beamSize) { while (idx < dim) { if (topK[beamSize - 1] < val[idx]) { Pair tmp(val[idx], col[idx]); @@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col, } } -template -__device__ __forceinline__ -void getTopK(Pair topK[], real *val, int *col, int idx, int dim, - const Pair& max, int beamSize) { +template +__device__ __forceinline__ void getTopK(Pair topK[], + real* val, + int* col, + int idx, + int dim, + const Pair& max, + int beamSize) { while (idx < dim) { if (topK[beamSize - 1] < val[idx]) { Pair tmp(val[idx], col[idx]); @@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim, } } -template -__device__ __forceinline__ -void threadGetTopK(Pair topK[], int& beam, int beamSize, - real* src, - bool& firstStep, bool& isEmpty, Pair& max, - int dim, const int tid) { +template +__device__ __forceinline__ void threadGetTopK(Pair topK[], + int& beam, + int beamSize, + real* src, + bool& firstStep, + bool& isEmpty, + 
Pair& max, + int dim, + const int tid) { if (beam > 0) { int length = beam < beamSize ? beam : beamSize; if (firstStep) { @@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } if (!isEmpty) { - getTopK(topK + maxLength - beam, src, tid, dim, - max, length); + getTopK(topK + maxLength - beam, src, tid, dim, max, length); } } @@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } -template -__device__ __forceinline__ -void threadGetTopK(Pair topK[], int& beam, int beamSize, - real* val, int* col, - bool& firstStep, bool& isEmpty, Pair& max, - int dim, const int tid) { +template +__device__ __forceinline__ void threadGetTopK(Pair topK[], + int& beam, + int beamSize, + real* val, + int* col, + bool& firstStep, + bool& isEmpty, + Pair& max, + int dim, + const int tid) { if (beam > 0) { int length = beam < beamSize ? beam : beamSize; if (firstStep) { @@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } if (!isEmpty) { - getTopK(topK + maxLength - beam, val, col, tid, dim, - max, length); + getTopK( + topK + maxLength - beam, val, col, tid, dim, max, length); } } @@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } -template -__device__ __forceinline__ -void blockReduce(Pair* shTopK, int* maxId, Pair topK[], - real** topVal, int** topIds, - int& beam, int& beamSize, - const int tid, const int warp) { +template +__device__ __forceinline__ void blockReduce(Pair* shTopK, + int* maxId, + Pair topK[], + real** topVal, + int** topIds, + int& beam, + int& beamSize, + const int tid, + const int warp) { while (true) { __syncthreads(); if (tid < blockSize / 2) { @@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[], } } __syncthreads(); - for (int stride = blockSize / 4; stride > 0; stride = stride/2) { + for (int stride = blockSize / 4; stride > 0; stride = stride / 2) { if (tid < stride) { if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) { 
maxId[tid] = maxId[tid + stride]; @@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[], * 3. go to the second setp, until one thread's topK value is null; * 4. go to the first setp, until get the topK value. */ -template -__global__ void KeMatrixTopK(real* topVal, int ldv, - int * topIds, - real* src, int lds, +template +__global__ void KeMatrixTopK(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, int dim, int beamSize) { __shared__ Pair shTopK[blockSize]; @@ -271,7 +279,7 @@ __global__ void KeMatrixTopK(real* topVal, int ldv, topVal += blockIdx.x * ldv; topIds += blockIdx.x * beamSize; - Pair topK[maxLength]; // NOLINT + Pair topK[maxLength]; // NOLINT int beam = maxLength; Pair max; bool isEmpty = false; @@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv, topK[k].set(-HL_FLOAT_MAX, -1); } while (beamSize) { - threadGetTopK - (topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); + threadGetTopK( + topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); shTopK[tid] = topK[0]; - blockReduce - (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); } } -template -__global__ void KeSMatrixTopK(real* topVal, int ldv, - int * topIds, +template +__global__ void KeSMatrixTopK(real* topVal, + int ldv, + int* topIds, real* val, int* row, int* col, @@ -304,7 +313,7 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv, topVal += blockIdx.x * ldv; topIds += blockIdx.x * beamSize; - Pair topK[maxLength]; // NOLINT + Pair topK[maxLength]; // NOLINT int beam = maxLength; Pair max; bool isEmpty = false; @@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv, topK[k].set(-HL_FLOAT_MAX, -1); } while (beamSize) { - threadGetTopK - (topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid); + threadGetTopK( + topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid); 
shTopK[tid] = topK[0]; - blockReduce - (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); } } -void hl_matrix_top_k(real* topVal, int ldv, - int * topIds, - real* src, int lds, +void hl_matrix_top_k(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, int dim, int beamSize, int numSamples) { @@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv, dim3 threads(256, 1); dim3 grid(numSamples, 1); - KeMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>> - (topVal, ldv, topIds, src, lds, dim, beamSize); + KeMatrixTopK<5, 256><<>>( + topVal, ldv, topIds, src, lds, dim, beamSize); CHECK_SYNC("hl_matrix_top_k failed"); } -void hl_sparse_matrix_top_k(real* topVal, int ldv, - int * topIds, +void hl_sparse_matrix_top_k(real* topVal, + int ldv, + int* topIds, hl_sparse_matrix_s src, int beamSize, int numSamples) { CHECK_NOTNULL(topVal); CHECK_NOTNULL(topIds); CHECK_NOTNULL(src); - CHECK_EQ(src->format, HL_SPARSE_CSR) - <<"sparse matrix format error!"; + CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!"; hl_csr_matrix csr = (hl_csr_matrix)src->matrix; - if (csr->csr_val == NULL || csr->csr_row == NULL || - csr->csr_col == NULL) { + if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) { LOG(FATAL) << "parameter src is null!"; } dim3 threads(256, 1); dim3 grid(numSamples, 1); - KeSMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>> - (topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize); + KeSMatrixTopK<5, 256><<>>( + topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize); CHECK_SYNC("hl_sparse_matrix_top_k failed"); } @@ -392,10 +402,12 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv, * 3. go to the second setp, until one thread's topK value is null; * 4. go to the first setp, until get the topK value. 
*/ -template -__global__ void KeMatrixTopKClassificationError(real* topVal, int ldv, - int * topIds, - real* src, int lds, +template +__global__ void KeMatrixTopKClassificationError(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, int dim, int beamSize, int* label, @@ -408,7 +420,7 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv, topVal += blockIdx.x * ldv; topIds += blockIdx.x * beamSize; - Pair topK[maxLength]; // NOLINT + Pair topK[maxLength]; // NOLINT int beam = maxLength; Pair max; bool isEmpty = false; @@ -420,34 +432,36 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv, } while (beamSize) { - threadGetTopK - (topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); + threadGetTopK( + topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); shTopK[tid] = topK[0]; - blockReduce - (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); } __syncthreads(); if (tid == 0) { for (int i = 0; i < topkSize; i++) { - if (*--topIds == label[blockIdx.x]) { - recResult[blockIdx.x] = 0; - break; - } - recResult[blockIdx.x] = 1.0f; + if (*--topIds == label[blockIdx.x]) { + recResult[blockIdx.x] = 0; + break; + } + recResult[blockIdx.x] = 1.0f; } } } -void hl_matrix_classification_error(real* topVal, int ldv, - int* topIds, - real* src, int lds, - int dim, - int topkSize, - int numSamples, - int* label, - real* recResult) { +void hl_matrix_classification_error(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, + int dim, + int topkSize, + int numSamples, + int* label, + real* recResult) { CHECK_NOTNULL(topVal); CHECK_NOTNULL(topIds); CHECK_NOTNULL(src); @@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv, dim3 threads(256, 1); dim3 grid(numSamples, 1); - KeMatrixTopKClassificationError<5, 256> - <<< grid, threads, 0, STREAM_DEFAULT >>> - (topVal, ldv, topIds, src, lds, 
dim, topkSize, label, recResult); + KeMatrixTopKClassificationError<5, 256><<>>( + topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult); CHECK_SYNC("hl_matrix_top_k classification error failed"); } diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 12a3a00bba35d476fca9c9fb47ac20b87e6f53f2..33e6baf818a728d7bf50ba110274d60000dcc22e 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -12,13 +12,15 @@ cc_test(variable_test SRCS variable_test.cc) cc_library(scope SRCS scope.cc) cc_test(scope_test SRCS scope_test.cc DEPS scope) -proto_library(attr_type SRCS attr_type.proto) -proto_library(op_proto SRCS op_proto.proto DEPS attr_type) -proto_library(op_desc SRCS op_desc.proto DEPS attr_type) +proto_library(attribute_proto SRCS attribute.proto) +proto_library(op_proto SRCS op_proto.proto DEPS attribute_proto) +proto_library(op_desc SRCS op_desc.proto DEPS attribute_proto) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) -cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope) +cc_library(attribute SRCS attribute.cc DEPS op_desc op_proto) + +cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope attribute) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator) @@ -26,13 +28,24 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_builder) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op) -py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) +py_proto_compile(framework_py_proto SRCS attribute.proto op_proto.proto op_desc.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. 
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) -cc_library(net SRCS net.cc DEPS op_registry) -cc_test(net_op_test SRCS net_op_test.cc DEPS net) - -cc_library(backward SRCS backward.cc DEPS net) +cc_library(backward SRCS backward.cc DEPS net_op) cc_test(backward_test SRCS backward_test.cc DEPS backward) + +if(WITH_PYTHON) +cc_library(paddle_pybind SHARED + SRCS pybind.cc + DEPS pybind python backward + fc_op + sgd_op + add_op + mean_op + cross_entropy_op + recurrent_op + uniform_random_op + fill_zeros_like_op) +endif(WITH_PYTHON) diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc new file mode 100644 index 0000000000000000000000000000000000000000..4c5790693b7e48396e945d09f4fdc72b86aa5978 --- /dev/null +++ b/paddle/framework/attribute.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/framework/attribute.h" + +#include + +namespace paddle { +namespace framework { + +template <> +AttrType AttrTypeID() { + return INT; +} +template <> +AttrType AttrTypeID() { + return FLOAT; +} +template <> +AttrType AttrTypeID() { + return STRING; +} +template <> +AttrType AttrTypeID>() { + return INTS; +} +template <> +AttrType AttrTypeID>() { + return FLOATS; +} +template <> +AttrType AttrTypeID>() { + return STRINGS; +} + +Attribute GetAttrValue(const AttrDesc& attr_desc) { + switch (attr_desc.type()) { + case paddle::framework::AttrType::INT: { + return attr_desc.i(); + } + case paddle::framework::AttrType::FLOAT: { + return attr_desc.f(); + } + case paddle::framework::AttrType::STRING: { + return attr_desc.s(); + } + case paddle::framework::AttrType::INTS: { + std::vector val(attr_desc.ints_size()); + for (int i = 0; i < attr_desc.ints_size(); ++i) { + val[i] = attr_desc.ints(i); + } + return val; + } + case paddle::framework::AttrType::FLOATS: { + std::vector val(attr_desc.floats_size()); + for (int i = 0; i < attr_desc.floats_size(); ++i) { + val[i] = attr_desc.floats(i); + } + return val; + } + case paddle::framework::AttrType::STRINGS: { + std::vector val(attr_desc.strings_size()); + for (int i = 0; i < attr_desc.strings_size(); ++i) { + val[i] = attr_desc.strings(i); + } + return val; + } + } + PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !"); + return boost::blank(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/attr_checker.h b/paddle/framework/attribute.h similarity index 79% rename from paddle/framework/attr_checker.h rename to paddle/framework/attribute.h index ea5614a45f3a77a851358aff80abbc276c9972ba..3a5820e9c60539e3c771df5da4e82f6c1cae688f 100644 --- a/paddle/framework/attr_checker.h +++ b/paddle/framework/attribute.h @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #pragma once #include @@ -6,6 +20,9 @@ #include #include #include + +#include "paddle/framework/attribute.pb.h" +#include "paddle/framework/op_desc.pb.h" #include "paddle/platform/enforce.h" namespace paddle { @@ -14,13 +31,19 @@ namespace framework { typedef boost::variant, std::vector, std::vector> Attribute; + typedef std::unordered_map AttributeMap; +template +AttrType AttrTypeID(); + +Attribute GetAttrValue(const AttrDesc& attr_desc); + // check whether a value(attribute) fit a certain limit template class LargerThanChecker { public: - LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {} + explicit LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {} void operator()(T& value) const { PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail"); } @@ -35,7 +58,8 @@ class LargerThanChecker { template class DefaultValueSetter { public: - DefaultValueSetter(T default_value) : default_value_(default_value) {} + explicit DefaultValueSetter(T default_value) + : default_value_(default_value) {} void operator()(T& value) const { value = default_value_; } private: @@ -78,7 +102,8 @@ class TypedAttrChecker { typedef std::function ValueChecker; public: - TypedAttrChecker(const std::string& attr_name) : attr_name_(attr_name) {} + explicit TypedAttrChecker(const std::string& attr_name) + : attr_name_(attr_name) {} TypedAttrChecker& InEnum(const std::unordered_set& range) { 
value_checkers_.push_back(EnumInContainer(range)); diff --git a/paddle/framework/attr_type.proto b/paddle/framework/attribute.proto similarity index 88% rename from paddle/framework/attr_type.proto rename to paddle/framework/attribute.proto index 2d8e0476d710b7ba987d085d828ca13a4ee23707..13ae312c10e934566384b8bd0f41dacd6c01fc2f 100644 --- a/paddle/framework/attr_type.proto +++ b/paddle/framework/attribute.proto @@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -syntax="proto2"; +syntax = "proto2"; package paddle.framework; // Attribute Type for paddle's Op. // Op contains many attributes. Each type of attributes could be different. // The AttrType will be shared between AttrDesc and AttrProto. enum AttrType { - INT = 0; - FLOAT = 1; - STRING = 2; - INTS = 3; - FLOATS = 4; - STRINGS = 5; + INT = 0; + FLOAT = 1; + STRING = 2; + INTS = 3; + FLOATS = 4; + STRINGS = 5; } \ No newline at end of file diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index 0da11b91a7fe4a98e0832f70095c3200956ff001..13706f8b562a1d68fe0d603f51c2fb47b4e18164 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -14,8 +14,8 @@ #include "paddle/framework/backward.h" #include -#include "paddle/framework/net.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/net_op.h" namespace paddle { namespace framework { @@ -32,7 +32,7 @@ static bool AllInSet(const std::vector& names, } static std::shared_ptr NOP() { - auto net_op = std::make_shared(); + auto net_op = std::make_shared(); net_op->type_ = "@NOP@"; net_op->CompleteAddOp(); return net_op; @@ -42,9 +42,9 @@ static std::shared_ptr NOP() { // // no_grad_names the gradient variable names without gradient calculating. // -// uniq_id is a unique index used inside recursively calling BackwardRecursive. 
-// use `uid = uniq_id++;` to get the unique index, and pass `uniq_id` through -// recursive calling. +// uniq_id is a unique index used inside recursively calling +// BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and +// pass `uniq_id` through recursive calling. // // returns The backward operator. For simple situation, it is a simple // operator. For complex situation, it is a NetOp. @@ -59,32 +59,30 @@ std::shared_ptr BackwardRecursive( // If all input gradients of forwarding operator do not need to calculate, // just return an NOP. Not return null ptr because NOP does not take // too much time for calculation, but it is useful for simplifying logic. - if (AllInSet(forwardOp.inputs_, OperatorBase::GRAD_VAR_SUFFIX(), - no_grad_names)) { + if (AllInSet(forwardOp.inputs_, kGradVarSuffix, no_grad_names)) { return NOP(); } - // All output gradients of forwarding operator do not need to calculate. Then - // all input gradients cannot be computed at all, and we put them into + // All output gradients of forwarding operator do not need to calculate. + // Then all input gradients cannot be computed at all, and we put them into // `no_grad_names` set. Return an NOP. - if (AllInSet(forwardOp.outputs_, OperatorBase::GRAD_VAR_SUFFIX(), - no_grad_names)) { + if (AllInSet(forwardOp.outputs_, kGradVarSuffix, no_grad_names)) { for (auto& name : forwardOp.inputs_) { // Mark all input is not need - no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX()); + no_grad_names.insert(name + kGradVarSuffix); } return NOP(); } // Returned gradient network - auto net = std::make_shared(); + auto net = std::make_shared(); if (forwardOp.IsNetOp()) { // Because forwardOp is a net op, it can static_cast. - auto& forwardNet = static_cast(forwardOp); + auto& forwardNet = static_cast(forwardOp); - // Map from output gradient variable name to operator's indices in backward - // net. That operator generates that variable. 
+ // Map from output gradient variable name to operator's indices in + // backward net. That operator generates that variable. std::unordered_map> dup_output_ops; size_t local_op_id = 0; @@ -134,9 +132,9 @@ std::shared_ptr BackwardRecursive( std::shared_ptr grad_op = OpRegistry::CreateGradOp(forwardOp); for (std::string& grad_input : grad_op->inputs_) { if (no_grad_names.count(grad_input)) { - std::string prefix = grad_input.substr( - 0, grad_input.size() - OperatorBase::GRAD_VAR_SUFFIX().size()); - grad_input = prefix + OperatorBase::ZERO_VAR_SUFFIX(); + std::string prefix = + grad_input.substr(0, grad_input.size() - kGradVarSuffix.size()); + grad_input = prefix + kZeroVarSuffix; // If part of input gradient of that operator is not calculated, fill // zero variables to that input gradient. @@ -147,7 +145,7 @@ std::shared_ptr BackwardRecursive( for (std::string& grad_output : grad_op->outputs_) { if (no_grad_names.count(grad_output)) { - grad_output = OperatorBase::EMPTY_VAR_NAME(); + grad_output = kEmptyVarName; } } @@ -168,11 +166,14 @@ std::shared_ptr Backward( std::unordered_set no_grad_names; no_grad_names.reserve(no_grad_vars.size()); + no_grad_names.insert(kEmptyVarName + kGradVarSuffix); + for (auto& name : no_grad_vars) { - no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX()); + no_grad_names.insert(name + kGradVarSuffix); } size_t uid = 0; return BackwardRecursive(forwardOp, no_grad_names, uid); } + } // namespace framework } // namespace paddle diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc index b095c2c3d5dbf21b5ea70e17475a4aaad9b1db44..6c6e12ca254553a8fc02cadbe3a99989ee848943 100644 --- a/paddle/framework/backward_test.cc +++ b/paddle/framework/backward_test.cc @@ -15,8 +15,9 @@ #include "paddle/framework/backward.h" #include -#include "paddle/framework/net.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/net_op.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace 
framework { @@ -70,21 +71,21 @@ class NoGradOpMaker : public OpProtoAndCheckerMaker { } }; -class FcOp : public NetOp { +class FcOp : public ops::NetOp { public: void Init() override { AddOp(OpRegistry::CreateOp("mul", {Input("X"), Input("W")}, {Output("mul_result")}, {})); auto b_name = Input("b"); std::string before_act = "mul_result"; - if (b_name != EMPTY_VAR_NAME()) { + if (b_name != kEmptyVarName) { AddOp(OpRegistry::CreateOp("rowwise_add", {Output("mul_result"), b_name}, {Output("add_result")}, {})); before_act = "add_result"; } else { auto out_varname = Output("add_result"); - if (out_varname != EMPTY_VAR_NAME()) { - this->Rename(out_varname, EMPTY_VAR_NAME()); + if (out_varname != kEmptyVarName) { + this->Rename(out_varname, kEmptyVarName); } } @@ -161,14 +162,13 @@ TEST(Backward, simple_op_grad) { auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {}); ASSERT_NE(fwd, nullptr); auto gop = f::OpRegistry::CreateGradOp(*fwd); - ASSERT_EQ(1UL, gop->inputs_.size()); - ASSERT_EQ("Out" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->inputs_[0]); + ASSERT_EQ(4UL, gop->inputs_.size()); + ASSERT_EQ(f::kEmptyVarName, gop->inputs_[0]); ASSERT_EQ("rowwise_add_grad", gop->type_); - ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->outputs_[0]); - ASSERT_EQ("b" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->outputs_[1]); + ASSERT_EQ("X" + f::kGradVarSuffix, gop->outputs_[0]); + ASSERT_EQ("b" + f::kGradVarSuffix, gop->outputs_[1]); - ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(), - gop->Output("X" + f::OperatorBase::GRAD_VAR_SUFFIX())); + ASSERT_EQ("X" + f::kGradVarSuffix, gop->Output("X" + f::kGradVarSuffix)); } TEST(Backward, simple_op_not_need_grad) { @@ -176,13 +176,14 @@ TEST(Backward, simple_op_not_need_grad) { ASSERT_NE(fwd, nullptr); auto gop = f::Backward(*fwd, {"X"}); ASSERT_EQ(std::find(gop->outputs_.begin(), gop->outputs_.end(), - "X" + f::OperatorBase::GRAD_VAR_SUFFIX()), + "X" + f::kGradVarSuffix), gop->outputs_.end()); auto 
no_input_gop = f::Backward(*fwd, {"X", "b"}); ASSERT_NE(no_input_gop, nullptr); ASSERT_TRUE(no_input_gop->IsNetOp()); - ASSERT_EQ(0UL, std::static_pointer_cast(no_input_gop)->ops_.size()); + ASSERT_EQ(0UL, + std::static_pointer_cast(no_input_gop)->ops_.size()); } TEST(Backward, net_fc_backward_normal) { @@ -191,7 +192,7 @@ TEST(Backward, net_fc_backward_normal) { ASSERT_NE(fwd, nullptr); std::shared_ptr gop = f::Backward(*fwd, {}); ASSERT_TRUE(gop->IsNetOp()); - auto net = static_cast(gop.get()); + auto net = static_cast(gop.get()); ASSERT_NO_THROW(net->DebugString()); @@ -208,13 +209,13 @@ TEST(Backward, net_fc_backward_normal) { } TEST(Backward, net_fc_backward_not_have_b) { - std::shared_ptr fwd = f::OpRegistry::CreateOp( - "fc", {"X", "w", f::OperatorBase::EMPTY_VAR_NAME()}, - {"mul_result", "add_result", "tmp"}, {}); + std::shared_ptr fwd = + f::OpRegistry::CreateOp("fc", {"X", "w", f::kEmptyVarName}, + {"mul_result", "add_result", "tmp"}, {}); ASSERT_NE(fwd, nullptr); std::shared_ptr gop = f::Backward(*fwd, {}); ASSERT_TRUE(gop->IsNetOp()); - auto net = static_cast(gop.get()); + auto net = static_cast(gop.get()); ASSERT_NO_THROW(net->DebugString()); @@ -228,7 +229,7 @@ TEST(Backward, net_fc_backward_not_have_b) { } TEST(Backward, net_input_of_network_not_need_grad) { - f::NetOp net; + ops::NetOp net; net.AddOp(f::OpRegistry::CreateOp("fc", {"X", "W1", "b1"}, {"mul_tmp_0", "add_tmp_0", "hidden0"}, {})); net.AddOp(f::OpRegistry::CreateOp("fc", {"hidden0", "W2", "b2"}, @@ -236,39 +237,36 @@ TEST(Backward, net_input_of_network_not_need_grad) { net.CompleteAddOp(); auto bwd = Backward(net, {"X"}); // X@GRAD is not need. 
ASSERT_TRUE(bwd->IsNetOp()); - auto bwd_net = static_cast(bwd.get()); + auto bwd_net = static_cast(bwd.get()); std::unordered_set all_output = std::unordered_set( bwd_net->outputs_.begin(), bwd_net->outputs_.end()); - all_output.erase(f::OperatorBase::EMPTY_VAR_NAME()); + all_output.erase(f::kEmptyVarName); for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) { - ASSERT_NE(all_output.find(out + f::OperatorBase::GRAD_VAR_SUFFIX()), - all_output.end()); + ASSERT_NE(all_output.find(out + f::kGradVarSuffix), all_output.end()); } // Not Generated X - ASSERT_EQ(all_output.find("X" + f::OperatorBase::GRAD_VAR_SUFFIX()), - all_output.end()); + ASSERT_EQ(all_output.find("X" + f::kGradVarSuffix), all_output.end()); ASSERT_EQ(2UL, bwd_net->ops_.size()); ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp()); - auto first_fc_grad = static_cast(bwd_net->ops_[1].get()); + auto first_fc_grad = static_cast(bwd_net->ops_[1].get()); ASSERT_EQ(3UL, first_fc_grad->ops_.size()); - ASSERT_EQ( - f::OperatorBase::EMPTY_VAR_NAME(), - first_fc_grad->ops_[2]->Output("A" + f::OperatorBase::GRAD_VAR_SUFFIX())); + ASSERT_EQ(f::kEmptyVarName, + first_fc_grad->ops_[2]->Output("A" + f::kGradVarSuffix)); } TEST(Backward, net_shared_weight) { - f::NetOp net; + ops::NetOp net; net.AddOp(f::OpRegistry::CreateOp("mul", {"X", "W"}, {"Out"}, {})); net.AddOp(f::OpRegistry::CreateOp("mul", {"Out", "W"}, {"FinalOut"}, {})); net.CompleteAddOp(); auto bwd = f::Backward(net, {}); ASSERT_TRUE(bwd->IsNetOp()); - auto bwd_net = static_cast(bwd.get()); + auto bwd_net = static_cast(bwd.get()); ASSERT_EQ(3UL, bwd_net->ops_.size()); ASSERT_EQ("add", bwd_net->ops_[2]->type_); } @@ -285,7 +283,7 @@ TEST(Backward, op_all_input_are_not_need) { auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {}); auto backward = f::Backward(*fwd, {"X", "b"}); ASSERT_TRUE(backward->IsNetOp()); - auto net = static_cast(backward.get()); + auto net = static_cast(backward.get()); ASSERT_TRUE(net->ops_.empty()); } @@ -293,7 +291,7 @@ 
TEST(Backward, op_all_output_are_not_need) { auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {}); auto backward = f::Backward(*fwd, {"Out"}); ASSERT_TRUE(backward->IsNetOp()); - auto net = static_cast(backward.get()); + auto net = static_cast(backward.get()); ASSERT_TRUE(net->ops_.empty()); } @@ -301,7 +299,7 @@ TEST(Backward, op_part_of_output_are_not_need) { auto fwd = f::OpRegistry::CreateOp("many_output_op", {"X"}, {"Y", "Z"}, {}); auto backward = f::Backward(*fwd, {"Z"}); ASSERT_TRUE(backward->IsNetOp()); - auto net = static_cast(backward.get()); + auto net = static_cast(backward.get()); ASSERT_EQ(net->ops_.size(), 2UL); auto &fill_zero = *net->ops_[0]; @@ -309,17 +307,15 @@ TEST(Backward, op_part_of_output_are_not_need) { ASSERT_EQ(1UL, fill_zero.inputs_.size()); ASSERT_EQ("Z", fill_zero.inputs_[0]); ASSERT_EQ(1UL, fill_zero.outputs_.size()); - ASSERT_EQ("Z" + f::OperatorBase::ZERO_VAR_SUFFIX(), fill_zero.outputs_[0]); + ASSERT_EQ("Z" + f::kZeroVarSuffix, fill_zero.outputs_[0]); auto &d_many_out = *net->ops_[1]; ASSERT_EQ("many_output_op_grad", d_many_out.type_); ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.inputs_.size()); // I/O/OG - ASSERT_EQ("Z" + f::OperatorBase::ZERO_VAR_SUFFIX(), - d_many_out.Input("z" + f::OperatorBase::GRAD_VAR_SUFFIX())); - ASSERT_EQ("Y" + f::OperatorBase::GRAD_VAR_SUFFIX(), - d_many_out.Input("y" + f::OperatorBase::GRAD_VAR_SUFFIX())); - ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(), - d_many_out.Output("x" + f::OperatorBase::GRAD_VAR_SUFFIX())); + ASSERT_EQ("Z" + f::kZeroVarSuffix, d_many_out.Input("z" + f::kGradVarSuffix)); + ASSERT_EQ("Y" + f::kGradVarSuffix, d_many_out.Input("y" + f::kGradVarSuffix)); + ASSERT_EQ("X" + f::kGradVarSuffix, + d_many_out.Output("x" + f::kGradVarSuffix)); } TEST(Backward, op_part_of_input_are_not_need) { @@ -329,19 +325,17 @@ TEST(Backward, op_part_of_input_are_not_need) { ASSERT_EQ(grad_mul.type_, "mul_grad"); ASSERT_EQ(grad_mul.inputs_.size(), 2UL + 1UL + 1UL); 
ASSERT_EQ(grad_mul.outputs_.size(), 2UL); - ASSERT_EQ(grad_mul.Output("A" + f::OperatorBase::GRAD_VAR_SUFFIX()), - f::OperatorBase::EMPTY_VAR_NAME()); - ASSERT_EQ(grad_mul.Output("B" + f::OperatorBase::GRAD_VAR_SUFFIX()), - "b" + f::OperatorBase::GRAD_VAR_SUFFIX()); - ASSERT_EQ(grad_mul.Input("Out" + f::OperatorBase::GRAD_VAR_SUFFIX()), - "out" + f::OperatorBase::GRAD_VAR_SUFFIX()); + ASSERT_EQ(grad_mul.Output("A" + f::kGradVarSuffix), f::kEmptyVarName); + ASSERT_EQ(grad_mul.Output("B" + f::kGradVarSuffix), "b" + f::kGradVarSuffix); + ASSERT_EQ(grad_mul.Input("Out" + f::kGradVarSuffix), + "out" + f::kGradVarSuffix); ASSERT_EQ(grad_mul.Input("A"), "a"); ASSERT_EQ(grad_mul.Input("B"), "b"); ASSERT_EQ(grad_mul.Input("Out"), "out"); } TEST(Backward, linear_net_intermediate_variable_has_no_grad) { - f::NetOp net; + ops::NetOp net; net.AddOp(f::OpRegistry::CreateOp("fc", {"x1", "w1", "b1"}, {"mul_out1", "add_out1", "out1"}, {})); net.AddOp(f::OpRegistry::CreateOp("fc", {"out1", "w2", "b2"}, @@ -351,14 +345,13 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) { net.CompleteAddOp(); auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"}); ASSERT_TRUE(backward->IsNetOp()); - auto bwd_net = static_cast(backward.get()); + auto bwd_net = static_cast(backward.get()); ASSERT_EQ(bwd_net->ops_.size(), 3UL); auto &grad_fc = *bwd_net->ops_[0]; EXPECT_EQ(grad_fc.inputs_.size(), 3UL /* external input number */ + 1UL /* external output number*/ + 1UL /* number of gradient of external output*/ - - 1UL /*ignoreGradient varable number*/ + 2U /* internal variable number*/); EXPECT_EQ(grad_fc.outputs_.size(), 2UL /* input number of mul*/ + 2UL /* input number of rowwise_add */ @@ -367,23 +360,4 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) { EXPECT_EQ(bwd_net->ops_[1]->outputs_.size(), 0UL); EXPECT_EQ(bwd_net->ops_[2]->inputs_.size(), 0UL); EXPECT_EQ(bwd_net->ops_[2]->outputs_.size(), 0UL); - - /* - EXPECT_EQ(grad_fc.Output("X" + 
f::OperatorBase::GRAD_VAR_SUFFIX()), - f::OperatorBase::EMPTY_VAR_NAME()); - EXPECT_EQ(grad_fc.Output("W" + f::OperatorBase::GRAD_VAR_SUFFIX()), - "w3" + f::OperatorBase::GRAD_VAR_SUFFIX()); - EXPECT_EQ(grad_fc.Output("b" + f::OperatorBase::GRAD_VAR_SUFFIX()), - "b3" + f::OperatorBase::GRAD_VAR_SUFFIX()); - EXPECT_EQ(grad_fc.Output("mul_result" + f::OperatorBase::GRAD_VAR_SUFFIX()), - "mul_out3" + f::OperatorBase::GRAD_VAR_SUFFIX()); - - EXPECT_EQ(grad_fc.Input("Out" + f::OperatorBase::GRAD_VAR_SUFFIX()), - "out3" + f::OperatorBase::GRAD_VAR_SUFFIX()); - EXPECT_EQ(grad_fc.Input("X"), "out2"); - EXPECT_EQ(grad_fc.Input("W"), "w3"); - EXPECT_EQ(grad_fc.Input("mul_result"), "mul_out3"); - EXPECT_EQ(grad_fc.Input("add_result"), "tmp_out3"); - EXPECT_EQ(grad_fc.Input("Out"), "out3"); - */ } diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index 9fcc657edcd5459d0a42a64d708603a4bcd53cf0..5aa5af0c19be5a209c760282cb1a090fc57a53ad 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -25,18 +25,15 @@ limitations under the License. */ namespace paddle { namespace framework { -namespace { -typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>, - Dim<8>, Dim<9>> - DDimVar; -} - /** * \brief A dynamically sized dimension. * * The number of dimensions must be between [1, 9]. 
*/ struct DDim { + typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>, + Dim<8>, Dim<9>> + DDimVar; DDimVar var; DDim() : var(Dim<1>()) {} diff --git a/paddle/framework/grad_op_builder.cc b/paddle/framework/grad_op_builder.cc index dd686cc78246f06cdc3ec7d013086863d7e8fac0..6d032fb78f099f5142d64e531d1a03c10ed5e68e 100644 --- a/paddle/framework/grad_op_builder.cc +++ b/paddle/framework/grad_op_builder.cc @@ -8,107 +8,95 @@ You may obtain a copy of the License at Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +WITHOpArgType::OUT WARRANTIES OR CONDITIONS OF ANY KOpArgType::IND, either +express or implied. See the License for the specific language governing +permissions and limitations under the License. */ #include "paddle/framework/grad_op_builder.h" +#include "paddle/framework/op_proto.pb.h" #include "paddle/framework/op_registry.h" namespace paddle { namespace framework { -OperatorBase* GradOpBuilder::Build() { - BuildOpInOutArgList(); - std::string grad_op_type = OpRegistry::grad_ops().at(op_.type_); - OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)(); - grad_op->type_ = grad_op_type; - CompleteGradOp(grad_op); - return grad_op; -} +class OpRegistry; + +using VarIndexMap = std::unordered_map; -OpInOutArg* GradOpBuilder::BuildArg(const VarProto& var, - const VarIndexMap& var_map, - const std::vector& format, - InOutType type) { - int idx = var_map.at(var.name()); - int begin_idx = format.empty() ? idx : format.at(idx); - int end_idx = format.empty() ? 
idx + 1 : format.at(idx + 1); - return new OpInOutArg(var.name(), type, !var.ignore_gradient(), begin_idx, - end_idx); +enum class OpArgType { IN, OUT }; + +static std::vector* GetOpFormat(OperatorBase* op, const OpArgType& type) { + std::string key = type == OpArgType::IN ? "input_format" : "output_format"; + return op->attrs_.count(key) + ? &boost::get>(op->attrs_.at(key)) + : nullptr; } -void GradOpBuilder::BuildOpInOutArgList() { - const OpProto& op_proto = OpRegistry::protos().at(op_.type_); - const auto& var_map = *(OpRegistry::VarIndexMaps().at(op_.type_)); - const std::vector& in_format = - op_.attrs_.count("input_format") - ? op_.GetAttr>("input_format") - : std::vector(); - const std::vector& out_format = - op_.attrs_.count("output_format") - ? op_.GetAttr>("output_format") - : std::vector(); - for (const auto& var : op_proto.inputs()) { - arg_list_.emplace_back( - std::shared_ptr(BuildArg(var, var_map, in_format, IN))); - } - for (const auto& var : op_proto.outputs()) { - arg_list_.emplace_back( - std::shared_ptr(BuildArg(var, var_map, out_format, OUT))); - } +static const std::vector* GetOpFormat(const OperatorBase* op, + const OpArgType& type) { + std::string key = type == OpArgType::IN ? "input_format" : "output_format"; + return op->attrs_.count(key) + ? &boost::get>(op->attrs_.at(key)) + : nullptr; } -void GradOpBuilder::AddArgIntoGradOp(const OpInOutArg* arg, - std::vector& in_out, - std::vector& format, - VarIndexMap* varmap, int& idx, - bool is_grad) const { - std::string var_name = arg->proto_name_; - if (is_grad) { - var_name += OperatorBase::GRAD_VAR_SUFFIX(); - } - (*varmap)[var_name] = idx++; - size_t pre_sz = in_out.size(); - auto base_it = arg->type_ == IN ? 
op_.inputs_.begin() : op_.outputs_.begin(); - std::copy(base_it + arg->begin_idx_, base_it + arg->end_idx_, - std::back_inserter(in_out)); - if (is_grad) { - for (size_t i = pre_sz; i < in_out.size(); ++i) { - in_out[i] += OperatorBase::GRAD_VAR_SUFFIX(); +static void TransOpArg(const OperatorBase* src_op, OperatorBase* dst_op, + const OpArgType& src_type, const OpArgType& dst_type, + int& idx, bool is_grad) { + const std::vector& src_inout = + src_type == OpArgType::IN ? src_op->inputs_ : src_op->outputs_; + const std::vector* src_format = GetOpFormat(src_op, src_type); + + std::vector& dst_inout = + dst_type == OpArgType::IN ? dst_op->inputs_ : dst_op->outputs_; + std::vector* dst_format = GetOpFormat(dst_op, dst_type); + const OpProto& proto = OpRegistry::protos().at(src_op->type_); + const auto& src_arg_list = + src_type == OpArgType::IN ? proto.inputs() : proto.outputs(); + + for (const auto& arg : src_arg_list) { + std::string src_name = arg.name(); + std::string dst_name = is_grad ? src_name + kGradVarSuffix : src_name; + (*dst_op->in_out_idxs_)[dst_name] = idx++; + int src_arg_idx = src_op->in_out_idxs_->at(src_name); + int src_begin = + src_format == nullptr ? src_arg_idx : src_format->at(src_arg_idx); + int src_end = src_format == nullptr ? src_arg_idx + 1 + : src_format->at(src_arg_idx + 1); + for (int i = src_begin; i < src_end; ++i) { + std::string s = + is_grad ? src_inout[i] + kGradVarSuffix + : (arg.ignore_gradient() ? 
kEmptyVarName : src_inout[i]); + dst_inout.emplace_back(s); + } + if (dst_format != nullptr) { + dst_format->push_back(dst_inout.size()); } } - format.push_back(in_out.size()); } -void GradOpBuilder::CompleteGradOp(OperatorBase* grad_op) const { - grad_op->attrs_ = op_.attrs_; +OperatorBase* BuildGradOp(const OperatorBase* op) { + std::string grad_op_type = OpRegistry::grad_ops().at(op->type_); + OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)(); + grad_op->type_ = grad_op_type; + grad_op->attrs_ = op->attrs_; grad_op->attrs_.erase("input_format"); grad_op->attrs_.erase("output_format"); - VarIndexMap* grad_varmap = new VarIndexMap(); + if (GetOpFormat(op, OpArgType::IN) != nullptr) { + grad_op->attrs_["output_format"] = std::vector({0}); + } + if (GetOpFormat(op, OpArgType::IN) != nullptr || + GetOpFormat(op, OpArgType::OUT) != nullptr) { + grad_op->attrs_["input_format"] = std::vector({0}); + } + grad_op->in_out_idxs_.reset(new VarIndexMap()); int in_idx = 0; int out_idx = 0; - std::vector in_format({0}); - std::vector out_format({0}); - for (const auto& arg : arg_list_) { - // op_'s inputs_ and outputs_ - if (arg->needed_in_grad_) { - AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap, - in_idx, false); - } - if (arg->type_ == IN) { - // gradients of op_'s inputs_ - AddArgIntoGradOp(arg.get(), grad_op->outputs_, out_format, grad_varmap, - out_idx, true); - } else { - // gradients of op_'s outputs_ - AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap, - in_idx, true); - } - } - grad_op->attrs_["input_format"] = in_format; - grad_op->attrs_["output_format"] = out_format; - grad_op->in_out_idxs_.reset(grad_varmap); + TransOpArg(op, grad_op, OpArgType::IN, OpArgType::IN, in_idx, false); // I + TransOpArg(op, grad_op, OpArgType::OUT, OpArgType::IN, in_idx, false); // G + TransOpArg(op, grad_op, OpArgType::OUT, OpArgType::IN, in_idx, true); // OG + TransOpArg(op, grad_op, OpArgType::IN, OpArgType::OUT, out_idx, 
true); // IG + return grad_op; } } // namespace framework diff --git a/paddle/framework/grad_op_builder.h b/paddle/framework/grad_op_builder.h index cc7a76f3726e00a08fbe06bca4c9b9f5bad466b4..998f8ebbb5f2f4fb8b7e938b5916afd0f8a7930d 100644 --- a/paddle/framework/grad_op_builder.h +++ b/paddle/framework/grad_op_builder.h @@ -1,48 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #pragma once -#include "paddle/framework/op_proto.pb.h" #include "paddle/framework/operator.h" namespace paddle { namespace framework { -class OpRegistry; - -enum InOutType { IN, OUT }; - -struct OpInOutArg { - OpInOutArg(const std::string& proto_name, const InOutType& type, - bool needed_in_grad, size_t begin_idx, size_t end_idx) - : proto_name_(proto_name), - type_(type), - needed_in_grad_(needed_in_grad), - begin_idx_(begin_idx), - end_idx_(end_idx) {} - - std::string proto_name_; - InOutType type_; - bool needed_in_grad_; - size_t begin_idx_; - size_t end_idx_; -}; - -class GradOpBuilder { - using VarIndexMap = std::unordered_map; - - public: - GradOpBuilder(const OperatorBase& op) : op_(op) {} - OperatorBase* Build(); - - private: - OpInOutArg* BuildArg(const VarProto& var, const VarIndexMap& var_map, - const std::vector& format, InOutType type); - void BuildOpInOutArgList(); - void AddArgIntoGradOp(const OpInOutArg* arg, std::vector& in_out, - std::vector& format, VarIndexMap* varmap, int& idx, - bool is_grad) const; - void 
CompleteGradOp(OperatorBase* grad_op) const; - const OperatorBase& op_; - std::vector> arg_list_; -}; + +OperatorBase* BuildGradOp(const OperatorBase* op); } // namespace framework } // namespace paddle diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc index e9cf3b9798db2cbfb8d26259ae9a6741fbae8278..cf7143eba4460e5619188b82ffe23db11a04a236 100644 --- a/paddle/framework/grad_op_builder_test.cc +++ b/paddle/framework/grad_op_builder_test.cc @@ -8,10 +8,49 @@ USE_OP(add_two); namespace paddle { namespace framework { +class NOP : public OperatorBase { + public: + void InferShape(const Scope &scope) const override {} + void Run(const Scope &scope, + const platform::DeviceContext &dev_ctx) const override {} +}; + +class MutiInOutOpMaker : public OpProtoAndCheckerMaker { + public: + MutiInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("In1", "a single input"); + AddInput("In2_mult", "a multiple input").SetMultiple(); + AddInput("In3", "another single input"); + AddOutput("Out1", "a single output"); + AddOutput("Out2_mult", "a multiple output").SetMultiple(); + AddComment("test op with multiple inputs and outputs"); + } +}; + +class IOIgnoredOpMaker : public OpProtoAndCheckerMaker { + public: + IOIgnoredOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("In1", "a single input"); + AddInput("In2_mult", "a multiple input").SetMultiple().IgnoreGradient(); + AddInput("In3_mult", "another multiple input").SetMultiple(); + AddOutput("Out1_mult", "a multiple output").SetMultiple(); + AddOutput("Out2", "a single output").IgnoreGradient(); + AddComment("op with inputs and outputs ignored in gradient calculating"); + } +}; + +} // namespace framework +} // namespace paddle + +namespace f = paddle::framework; + TEST(GradOpBuilder, AddTwo) { - std::shared_ptr add_op( - OpRegistry::CreateOp("add_two", {"x", "y"}, 
{"out"}, {})); - std::shared_ptr grad_add_op = OpRegistry::CreateGradOp(*add_op); + std::shared_ptr add_op( + f::OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {})); + std::shared_ptr grad_add_op = + f::OpRegistry::CreateGradOp(*add_op); EXPECT_EQ(static_cast(grad_add_op->inputs_.size()), 4); EXPECT_EQ(static_cast(grad_add_op->outputs_.size()), 2); EXPECT_EQ(grad_add_op->Input("X"), "x"); @@ -22,5 +61,77 @@ TEST(GradOpBuilder, AddTwo) { EXPECT_EQ(grad_add_op->Output("Y@GRAD"), "y@GRAD"); } -} // namespace framework -} // namespace paddle \ No newline at end of file +REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker); +REGISTER_GRADIENT_OP(mult_io, mult_io_grad, f::NOP); +REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker); +REGISTER_GRADIENT_OP(io_ignored, io_ignored_grad, f::NOP); + +TEST(GradOpBuilder, MutiInOut) { + f::AttributeMap attrs{{"input_format", std::vector{0, 1, 4, 5}}, + {"output_format", std::vector{0, 1, 3}}}; + std::shared_ptr test_op(f::OpRegistry::CreateOp( + "mult_io", {"in1", "in2_1", "in2_2", "in2_3", "in3"}, + {"out1", "out2_1", "out2_2"}, attrs)); + std::shared_ptr grad_test_op = + f::OpRegistry::CreateGradOp(*test_op); + + ASSERT_EQ(grad_test_op->inputs_.size(), 5UL + 3UL + 3UL); + EXPECT_EQ(grad_test_op->Input("In1"), "in1"); + EXPECT_EQ(grad_test_op->Inputs("In2_mult"), + std::vector({"in2_1", "in2_2", "in2_3"})); + EXPECT_EQ(grad_test_op->Input("In3"), "in3"); + EXPECT_EQ(grad_test_op->Input("Out1"), "out1"); + EXPECT_EQ(grad_test_op->Inputs("Out2_mult"), + std::vector({"out2_1", "out2_2"})); + EXPECT_EQ(grad_test_op->Input("Out1" + f::kGradVarSuffix), + "out1" + f::kGradVarSuffix); + EXPECT_EQ(grad_test_op->Inputs("Out2_mult" + f::kGradVarSuffix), + std::vector( + {"out2_1" + f::kGradVarSuffix, "out2_2" + f::kGradVarSuffix})); + + ASSERT_EQ(grad_test_op->outputs_.size(), 5UL); + EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix), + "in1" + f::kGradVarSuffix); + EXPECT_EQ(grad_test_op->Outputs("In2_mult" + 
f::kGradVarSuffix), + std::vector({"in2_1" + f::kGradVarSuffix, + "in2_2" + f::kGradVarSuffix, + "in2_3" + f::kGradVarSuffix})); + EXPECT_EQ(grad_test_op->Output("In3" + f::kGradVarSuffix), + "in3" + f::kGradVarSuffix); +} + +TEST(GradOpBuilder, IOIgnoredInGradient) { + f::AttributeMap attrs{{"input_format", std::vector{0, 1, 3, 5}}, + {"output_format", std::vector{0, 2, 3}}}; + std::shared_ptr test_op(f::OpRegistry::CreateOp( + "io_ignored", {"in1", "in2_1", "in2_2", "in3_1", "in3_2"}, + {"out1_1", "out1_2", "out2"}, attrs)); + std::shared_ptr grad_test_op = + f::OpRegistry::CreateGradOp(*test_op); + + // 'In2' and 'Out2' are ignored in gradient calculating + ASSERT_EQ(grad_test_op->inputs_.size(), 5UL + 3UL + 3UL); + EXPECT_EQ(grad_test_op->Input("In1"), "in1"); + EXPECT_EQ(grad_test_op->Inputs("In2_mult"), + std::vector({f::kEmptyVarName, f::kEmptyVarName})); + EXPECT_EQ(grad_test_op->Inputs("In3_mult"), + std::vector({"in3_1", "in3_2"})); + EXPECT_EQ(grad_test_op->Inputs("Out1_mult"), + std::vector({"out1_1", "out1_2"})); + EXPECT_EQ(grad_test_op->Input("Out2"), f::kEmptyVarName); + EXPECT_EQ(grad_test_op->Inputs("Out1_mult" + f::kGradVarSuffix), + std::vector( + {"out1_1" + f::kGradVarSuffix, "out1_2" + f::kGradVarSuffix})); + EXPECT_EQ(grad_test_op->Input("Out2" + f::kGradVarSuffix), + "out2" + f::kGradVarSuffix); + + ASSERT_EQ(grad_test_op->outputs_.size(), 5UL); + EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix), + "in1" + f::kGradVarSuffix); + EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix), + std::vector( + {"in2_1" + f::kGradVarSuffix, "in2_2" + f::kGradVarSuffix})); + EXPECT_EQ(grad_test_op->Outputs("In3_mult" + f::kGradVarSuffix), + std::vector( + {"in3_1" + f::kGradVarSuffix, "in3_2" + f::kGradVarSuffix})); +} diff --git a/paddle/framework/op_desc.proto b/paddle/framework/op_desc.proto index 89497f3c16bc28aa93b25a83c1f2eccafdf1c5b4..d95ba26f88ae181f991440e0df30c80f80a7eb2a 100644 --- a/paddle/framework/op_desc.proto +++ 
b/paddle/framework/op_desc.proto @@ -12,24 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -syntax="proto2"; +syntax = "proto2"; package paddle.framework; -import "attr_type.proto"; +import "attribute.proto"; // AttrDesc is used to describe Attributes of an Operator. It contain's // name, type, and value of Attribute. // // e.g, for scale=3.0: name=scala, type=AttrType.FLOAT, value=3.0 message AttrDesc { - required string name = 1; - required AttrType type = 2; - optional int32 i = 3; - optional float f = 4; - optional string s = 5; - repeated int32 ints = 6; - repeated float floats = 7; - repeated string strings = 8; + required string name = 1; + required AttrType type = 2; + optional int32 i = 3; + optional float f = 4; + optional string s = 5; + repeated int32 ints = 6; + repeated float floats = 7; + repeated string strings = 8; }; // Protocol Message to describe an Operator. @@ -42,15 +42,15 @@ message AttrDesc { // 3rd-party language can build this proto message and call // AddOp(const OpDesc& op_desc) of Paddle core to create an Operator. message OpDesc { - // input names of this Operator. - repeated string inputs = 1; + // input names of this Operator. + repeated string inputs = 1; - // output names of this Operator. - repeated string outputs = 2; + // output names of this Operator. + repeated string outputs = 2; - // type of this Operator, such as "add", "sub", "fc". - required string type = 3; + // type of this Operator, such as "add", "sub", "fc". + required string type = 3; - // Attributes of this Operator. e.g., scale=3.0 in cosine op. - repeated AttrDesc attrs = 4; + // Attributes of this Operator. e.g., scale=3.0 in cosine op. 
+ repeated AttrDesc attrs = 4; }; \ No newline at end of file diff --git a/paddle/framework/op_proto.proto b/paddle/framework/op_proto.proto index 366c84e53dc29e41eefbaef0a6452e01c4fe37bd..52292162874b9ca207fb0d3917df41ade096b143 100644 --- a/paddle/framework/op_proto.proto +++ b/paddle/framework/op_proto.proto @@ -15,100 +15,102 @@ limitations under the License. */ // Protocol Message for 3rd-party language binding. // // Paddle Python package will use `OpProto` to generate op creation methods. -// The op creation methods take user's input and generate `OpDesc` proto message, +// The op creation methods take user's input and generate `OpDesc` proto +// message, // then pass `OpDesc` to C++ side and create Op pointer. // -syntax="proto2"; +syntax = "proto2"; package paddle.framework; -import "attr_type.proto"; +import "attribute.proto"; // Attribute protocol message for 3rd-party language binding. // It will store the Op support what attribute and what type. message AttrProto { - // Supported attribute name. e.g. `scale` for cosine op. - required string name = 1; + // Supported attribute name. e.g. `scale` for cosine op. + required string name = 1; - // Supported attribute type. - required AttrType type = 2; + // Supported attribute type. + required AttrType type = 2; - // Supported attribute comments. It helps 3rd-party language generate doc-string. - required string comment = 3; + // Supported attribute comments. It helps 3rd-party language generate + // doc-string. + required string comment = 3; - // If that attribute is generated, it means the Paddle third language - // binding has responsibility to fill that attribute. End-User should - // not set that attribute. - optional bool generated = 4 [default=false]; + // If that attribute is generated, it means the Paddle third language + // binding has responsibility to fill that attribute. End-User should + // not set that attribute. 
+ optional bool generated = 4 [ default = false ]; } // Input or output message for 3rd-party language binding. // It contains parameter name and its comments. message VarProto { - // Input or output name in that op creation function. - // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names. - required string name = 1; - - // The comment for that input. It helps 3rd-party language generate doc-string. - required string comment = 2; - - // Is that input/output could be a list or not. - // If so, that Op should write a attributed named `input_format` or - // `output_format`. - // - // e.g. - // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W` - // could be multiple, so the multiple of `X` and `W` is True, and OpDesc - // will hold a attribute of them. - // - // The Op desc of same fc could be - // { - // "type": "fc", - // "input": ["X1", "X2", "W1", "W2", "b"], - // "output": "fc.out", - // "attrs" : { - // "input_format": [0, 2, 4, 5] - // } - // } - // - optional bool multiple = 3 [default=false]; - - // It marks that output is a temporary output. That output is not used by - // user, but used by other op internally as input. If other op is not use - // that output, it could be optimized early. - // - // Attribute temporary_index will be set in OpDesc if there is some - // outputs are temporary. - // - // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"], - // attrs = { - // "temporary_index": [1] - // } - optional bool temporary = 4 [default=false]; - - // The gradient of operator can be ignored immediately - // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2 - // can be ignored for the future optimized on graph. - optional bool ignore_gradient = 6; + // Input or output name in that op creation function. + // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names. + required string name = 1; + + // The comment for that input. It helps 3rd-party language generate + // doc-string. 
+ required string comment = 2; + + // Is that input/output could be a list or not. + // If so, that Op should write a attributed named `input_format` or + // `output_format`. + // + // e.g. + // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W` + // could be multiple, so the multiple of `X` and `W` is True, and OpDesc + // will hold a attribute of them. + // + // The Op desc of same fc could be + // { + // "type": "fc", + // "input": ["X1", "X2", "W1", "W2", "b"], + // "output": "fc.out", + // "attrs" : { + // "input_format": [0, 2, 4, 5] + // } + // } + // + optional bool multiple = 3 [ default = false ]; + + // It marks that output is a temporary output. That output is not used by + // user, but used by other op internally as input. If other op is not use + // that output, it could be optimized early. + // + // Attribute temporary_index will be set in OpDesc if there is some + // outputs are temporary. + // + // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"], + // attrs = { + // "temporary_index": [1] + // } + optional bool temporary = 4 [ default = false ]; + + // The gradient of operator can be ignored immediately + // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2 + // can be ignored for the future optimized on graph. + optional bool ignore_gradient = 6; } // Op protocol message for 3rd-party language binding. // It contains all information for generating op creation method. message OpProto { - // The input information to generate op creation method. - repeated VarProto inputs = 1; + // The input information to generate op creation method. + repeated VarProto inputs = 1; - // The output information to generate op creation method. - repeated VarProto outputs = 2; + // The output information to generate op creation method. + repeated VarProto outputs = 2; - // The attribute information to generate op creation method. - repeated AttrProto attrs = 3; + // The attribute information to generate op creation method. 
+ repeated AttrProto attrs = 3; - // The comments for that Op. It helps 3rd-party language generate - // doc-string. The whole documentation of that Op is generated by comment, - // inputs, outputs, attrs together. - required string comment = 4; - - // The type of that Op. - required string type = 5; + // The comments for that Op. It helps 3rd-party language generate + // doc-string. The whole documentation of that Op is generated by comment, + // inputs, outputs, attrs together. + required string comment = 4; + // The type of that Op. + required string type = 5; } diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc index 1d14535c50b542733663a6900a8b5f2033290ea6..1caa02a2a1d046778f875d04eeaef957be741302 100644 --- a/paddle/framework/op_registry.cc +++ b/paddle/framework/op_registry.cc @@ -14,37 +14,8 @@ limitations under the License. */ #include -namespace paddle { -namespace framework { - -template <> -void AttrTypeHelper::SetAttrType(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::INT); -} - -template <> -void AttrTypeHelper::SetAttrType(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::FLOAT); -} - -template <> -void AttrTypeHelper::SetAttrType(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::STRING); -} +#include -template <> -void AttrTypeHelper::SetAttrType>(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::INTS); -} - -template <> -void AttrTypeHelper::SetAttrType>(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::FLOATS); -} - -template <> -void AttrTypeHelper::SetAttrType>(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::STRINGS); -} -} // namespace framework +namespace paddle { +namespace framework {} // namespace framework } // namespace paddle diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index f10c9297981a4c6aefc6c2072d0ac2b8e562a7a0..6c26183818a9d6996e3d3ce2af74ba36f4711eca 100644 --- 
a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -19,7 +19,7 @@ limitations under the License. */ #include #include #include -#include "paddle/framework/attr_checker.h" +#include "paddle/framework/attribute.h" #include "paddle/framework/grad_op_builder.h" #include "paddle/framework/op_desc.pb.h" #include "paddle/framework/scope.h" @@ -27,49 +27,6 @@ limitations under the License. */ namespace paddle { namespace framework { -// helper class to set attribute type -struct AttrTypeHelper { - template - static void SetAttrType(AttrProto* attr); - - static Attribute GetAttrValue(const AttrDesc& attr_desc) { - switch (attr_desc.type()) { - case paddle::framework::AttrType::INT: { - return attr_desc.i(); - } - case paddle::framework::AttrType::FLOAT: { - return attr_desc.f(); - } - case paddle::framework::AttrType::STRING: { - return attr_desc.s(); - } - case paddle::framework::AttrType::INTS: { - std::vector val(attr_desc.ints_size()); - for (int i = 0; i < attr_desc.ints_size(); ++i) { - val[i] = attr_desc.ints(i); - } - return val; - } - case paddle::framework::AttrType::FLOATS: { - std::vector val(attr_desc.floats_size()); - for (int i = 0; i < attr_desc.floats_size(); ++i) { - val[i] = attr_desc.floats(i); - } - return val; - } - case paddle::framework::AttrType::STRINGS: { - std::vector val(attr_desc.strings_size()); - for (int i = 0; i < attr_desc.strings_size(); ++i) { - val[i] = attr_desc.strings(i); - } - return val; - } - } - PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !"); - return boost::blank(); - } -}; - // this class not only make proto but also init attribute checkers. 
class OpProtoAndCheckerMaker { public: @@ -136,7 +93,7 @@ class OpProtoAndCheckerMaker { *attr->mutable_name() = name; *attr->mutable_comment() = comment; attr->set_generated(generated); - AttrTypeHelper::SetAttrType(attr); + attr->set_type(AttrTypeID()); return op_checker_->AddAttrChecker(name); } @@ -297,7 +254,7 @@ class OpRegistry { AttributeMap attrs; for (auto& attr : op_desc.attrs()) { - attrs[attr.name()] = AttrTypeHelper::GetAttrValue(attr); + attrs[attr.name()] = GetAttrValue(attr); } return CreateOp(op_desc.type(), inputs, outputs, attrs); @@ -306,8 +263,7 @@ class OpRegistry { static std::shared_ptr CreateGradOp(const OperatorBase& op) { PADDLE_ENFORCE(!op.IsNetOp(), "Use framework::Backward to get backward ops"); - GradOpBuilder builder(op); - std::shared_ptr grad_op(builder.Build()); + std::shared_ptr grad_op(BuildGradOp(&op)); grad_op->Init(); return grad_op; } @@ -315,7 +271,7 @@ class OpRegistry { static std::unordered_map& protos() { static std::unordered_map protos_; return protos_; - }; + } static std::unordered_map& grad_ops() { static std::unordered_map grad_ops_; @@ -337,12 +293,12 @@ class OpRegistry { static std::unordered_map& op_checkers() { static std::unordered_map op_checkers_; return op_checkers_; - }; + } static void GenerateTempVariableName(OperatorBase* op) { static std::atomic gUniqId(0UL); for (auto& outname : op->outputs_) { - if (outname == OperatorBase::TMP_VAR_NAME()) { + if (outname == kTempVarName) { outname += op->type_; outname += "@"; outname += std::to_string(gUniqId.fetch_add(1)); @@ -354,7 +310,7 @@ class OpRegistry { template class OpRegisterHelper { public: - OpRegisterHelper(const char* op_type) { + explicit OpRegisterHelper(const char* op_type) { OpRegistry::RegisterOp(op_type); } }; @@ -400,6 +356,14 @@ class GradOpRegisterHelper { return 0; \ } +/** + * Macro to Forbid user register Gradient Operator. 
+ */ +#define NO_GRADIENT(__op_type) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_gradient_op__##__op_type##__op_type##_grad, \ + "NO_GRADIENT must be in global namespace") + /** * Macro to Register OperatorKernel. */ diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index cfe9cba308556475ef64b45e7178dfc418761598..d9a013b883abdec4422806f90e36da7410a4fa0c 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -20,22 +20,22 @@ namespace paddle { namespace framework { template <> -Eigen::DefaultDevice* ExecutionContext::GetEigenDevice< +Eigen::DefaultDevice& ExecutionContext::GetEigenDevice< platform::CPUPlace, Eigen::DefaultDevice>() const { - return device_context_.get_eigen_device(); + return *device_context_->get_eigen_device(); } #ifndef PADDLE_ONLY_CPU template <> -Eigen::GpuDevice* +Eigen::GpuDevice& ExecutionContext::GetEigenDevice() const { - return device_context_.get_eigen_device(); + return *device_context_->get_eigen_device(); } #endif const std::string& OperatorBase::Input(const std::string& name) const { - PADDLE_ENFORCE(in_out_idxs_ != nullptr, - "Input Output Indices could not be nullptr"); + PADDLE_ENFORCE_NOT_NULL(in_out_idxs_, + "Input Output Indices could not be nullptr"); auto it = in_out_idxs_->find(name); PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_", name); @@ -49,7 +49,7 @@ const std::string& OperatorBase::Input(const std::string& name) const { } std::vector OperatorBase::Inputs(const std::string& name) const { - PADDLE_ENFORCE(in_out_idxs_ != nullptr, "IO Idx could not be nullptr"); + PADDLE_ENFORCE_NOT_NULL(in_out_idxs_, "IO Idx could not be nullptr"); auto input_format = GetAttr>("input_format"); auto offset = in_out_idxs_->at(name); PADDLE_ENFORCE(input_format.at(static_cast(offset) + 1) <= @@ -62,7 +62,7 @@ std::vector OperatorBase::Inputs(const std::string& name) const { } const std::string& OperatorBase::Output(const std::string& name) const { - 
PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr"); + PADDLE_ENFORCE_NOT_NULL(in_out_idxs_, "InOut Indice could not be nullptr"); auto it = in_out_idxs_->find(name); PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_", name); @@ -76,7 +76,7 @@ const std::string& OperatorBase::Output(const std::string& name) const { } std::vector OperatorBase::Outputs(const std::string& name) const { - PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr"); + PADDLE_ENFORCE_NOT_NULL(in_out_idxs_, "InOut Indice could not be nullptr"); auto output_format = GetAttr>("output_format"); auto offset = in_out_idxs_->at(name); PADDLE_ENFORCE(output_format.at(static_cast(offset) + 1) <= diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 0832a663dd01fe2921366d70599bc867e73af47c..c324fa6702de1eabab3f75cbf4e6568c99b60470 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -20,7 +20,7 @@ limitations under the License. */ #include #include -#include "paddle/framework/attr_checker.h" +#include "paddle/framework/attribute.h" #include "paddle/framework/op_desc.pb.h" #include "paddle/framework/op_proto.pb.h" #include "paddle/framework/scope.h" @@ -32,9 +32,29 @@ limitations under the License. */ namespace paddle { namespace framework { +/// If a variable is a empty variable, that name will be used. +const std::string kEmptyVarName = "@EMPTY@"; + +/// If a variable is a temporary variable, that name will be set in Python, +/// but it will be convert to a unique name in scope after OpCreator. +const std::string kTempVarName = "@TEMP@"; + +/// If a variable's name has a certain suffix, it means that the +/// variable is the gradient of another varibale. +/// e.g. Variable "x@GRAD" is the gradient of varibale "x". +const std::string kGradVarSuffix = "@GRAD"; + +/// Variables with this suffix are supposed to be filled up with zeros. 
+const std::string kZeroVarSuffix = "@ZERO"; + +inline std::string GradVarName(const std::string& var_name) { + return var_name + kGradVarSuffix; +} + class OperatorBase; class InferShapeContext; class ExecutionContext; + /** * OperatorBase has the basic element that Net will call to do computation. * Only CreateOperator from OpRegistry will new Operator directly. User @@ -43,21 +63,6 @@ class ExecutionContext; */ class OperatorBase { public: - /// If a variable is a empty variable, that name will be used. - static std::string EMPTY_VAR_NAME() { return "@EMPTY@"; } - - /// If a variable is a temporary variable, that name will be set in Python, - /// but it will be convert to a unique name in scope after OpCreator. - static std::string TMP_VAR_NAME() { return "@TEMP@"; } - - /// If a variable's name has a certain suffix, it means that the - /// variable is the gradient of another varibale. - /// e.g. Variable "x@GRAD" is the gradient of varibale "x". - static std::string GRAD_VAR_SUFFIX() { return "@GRAD"; } - - /// Variables with this suffix are supposed to be filled up with zeros. 
- static std::string ZERO_VAR_SUFFIX() { return "@ZERO"; } - virtual ~OperatorBase() {} template @@ -83,6 +88,8 @@ class OperatorBase { virtual bool IsNetOp() const { return false; } + virtual bool SupportGPU() const { return false; } + /// rename inputs outputs name void Rename(const std::string& old_name, const std::string& new_name); @@ -162,28 +169,32 @@ class OperatorContext { template const T* Input(const size_t index) const { auto var = InputVar(index); - PADDLE_ENFORCE(var != nullptr, "Input(%d) should not be nullptr", index); + PADDLE_ENFORCE_NOT_NULL(var, "Input(%d) should not be nullptr", index); return &var->Get(); } template T* Output(const size_t index) const { auto var = OutputVar(index); - PADDLE_ENFORCE(var != nullptr, "Output(%d) should not be nullptr", index); + PADDLE_ENFORCE_NOT_NULL( + var, + "Output(%d) not be nullptr, which means variable [%s] does not " + "exist in scope", + index, op_.outputs_[index]); return var->GetMutable(); } template const T* Input(const std::string& name) const { auto var = InputVar(name); - PADDLE_ENFORCE(var != nullptr, "Input(%s) should not be nullptr", name); + PADDLE_ENFORCE_NOT_NULL(var, "Input(%s) should not be nullptr", name); return &var->Get(); } template T* Output(const std::string& name) const { auto var = OutputVar(name); - PADDLE_ENFORCE(var != nullptr, "Output(%s) should not be nullptr", name); + PADDLE_ENFORCE_NOT_NULL(var, "Output(%s) should not be nullptr", name); return var->GetMutable(); } @@ -195,9 +206,9 @@ class OperatorContext { std::transform(names.begin(), names.end(), std::back_inserter(res), [&](const std::string& sub_name) { auto var = scope_.FindVar(sub_name); - PADDLE_ENFORCE(var != nullptr, - "MultiInput(%s:%s) should not be nullptr", - name, sub_name); + PADDLE_ENFORCE_NOT_NULL( + var, "MultiInput(%s:%s) should not be nullptr", name, + sub_name); return &var->Get(); }); return res; @@ -211,9 +222,9 @@ class OperatorContext { std::transform(names.begin(), names.end(), 
std::back_inserter(res), [&](const std::string& sub_name) { auto var = scope_.FindVar(sub_name); - PADDLE_ENFORCE(var != nullptr, - "MultiOutput(%s:%s) should not be nullptr", - name, sub_name); + PADDLE_ENFORCE_NOT_NULL( + var, "MultiOutput(%s:%s) should not be nullptr", name, + sub_name); return var->GetMutable(); }); return res; @@ -247,17 +258,17 @@ struct EigenDeviceConverter { class ExecutionContext : public OperatorContext { public: ExecutionContext(const OperatorBase* op, const Scope& scope, - const platform::DeviceContext& device_context) + const platform::DeviceContext* device_context) : OperatorContext(op, scope), device_context_(device_context) {} template ::EigenDeviceType> - DeviceType* GetEigenDevice() const; + DeviceType& GetEigenDevice() const; - platform::Place GetPlace() const { return device_context_.GetPlace(); } + platform::Place GetPlace() const { return device_context_->GetPlace(); } - const platform::DeviceContext& device_context_; + const platform::DeviceContext* device_context_; }; class OpKernel { @@ -280,7 +291,7 @@ class OperatorWithKernel : public OperatorBase { platform::Place place_; OpKernelKey() = default; - OpKernelKey(const platform::DeviceContext& dev_ctx) { + explicit OpKernelKey(const platform::DeviceContext& dev_ctx) { place_ = dev_ctx.GetPlace(); } @@ -299,14 +310,14 @@ class OperatorWithKernel : public OperatorBase { using OpKernelMap = std::unordered_map, OpKernelHash>; - void InferShape(const Scope& scope) const { + void InferShape(const Scope& scope) const override { InferShape(InferShapeContext(this, scope)); } void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const final { auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx)); - opKernel->Compute(ExecutionContext(this, scope, dev_ctx)); + opKernel->Compute(ExecutionContext(this, scope, &dev_ctx)); } static std::unordered_map& @@ -315,6 +326,12 @@ class OperatorWithKernel : public OperatorBase { return g_all_op_kernels; } + bool 
SupportGPU() const override { + OperatorWithKernel::OpKernelKey key; + key.place_ = platform::GPUPlace(); + return OperatorWithKernel::AllOpKernels().at(type_).count(key) != 0; + } + protected: virtual void InferShape(const InferShapeContext& ctx) const = 0; }; diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 6a6a802b7da05c37a317540030836baa28a89cd7..387aada749ba62246b44dedc050547c05955caa9 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -157,22 +157,22 @@ class CPUKernalMultiInputsTest : public OpKernel { ASSERT_EQ(xs[2], "x2"); auto inVar0 = ctx.MultiInputVar("xs"); - ASSERT_EQ(inVar0.size(), 3); + ASSERT_EQ(inVar0.size(), 3U); auto intVar1 = ctx.InputVar("k"); ASSERT_NE(intVar1, nullptr); auto outVar0 = ctx.MultiOutputVar("ys"); - ASSERT_EQ(outVar0.size(), 2); + ASSERT_EQ(outVar0.size(), 2U); auto inTensor0 = ctx.MultiInput("xs"); - ASSERT_EQ(inTensor0.size(), 3); + ASSERT_EQ(inTensor0.size(), 3U); auto intTensor1 = ctx.Input("k"); ASSERT_NE(intTensor1, nullptr); auto outTensor0 = ctx.MultiOutput("ys"); - ASSERT_EQ(outTensor0.size(), 2); + ASSERT_EQ(outTensor0.size(), 2U); auto k = ctx.op_.Input("k"); ASSERT_EQ(k, "k0"); diff --git a/paddle/framework/pybind.cc b/paddle/framework/pybind.cc new file mode 100644 index 0000000000000000000000000000000000000000..e17d0874a938bc615638e78dd4a1a3cc2a9f0878 --- /dev/null +++ b/paddle/framework/pybind.cc @@ -0,0 +1,260 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/framework/backward.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" +#include "paddle/framework/scope.h" +#include "paddle/framework/tensor_py.h" +#include "paddle/operators/net_op.h" +#include "paddle/operators/type_alias.h" +#include "paddle/platform/enforce.h" +#include "paddle/platform/place.h" +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +USE_OP(add_two); +USE_OP_CPU(onehot_cross_entropy); +USE_OP_WITHOUT_KERNEL(fc); +USE_OP(sgd); +USE_OP(mul); +USE_OP(mean); +USE_OP(sigmoid); +USE_OP(softmax); +USE_OP(rowwise_add); +USE_OP(fill_zeros_like); +USE_OP_WITHOUT_KERNEL(recurrent_op); +USE_OP(uniform_random); +namespace paddle { +namespace framework { +template +void ExposeOperator(ClassType &m) { + m.def("infer_shape", &ClassType::type::InferShape) + .def("run", &ClassType::type::Run) + .def("type", + [](const typename ClassType::type &op) -> std::string { + return op.type_; + }) + .def("outputs", + [](const typename ClassType::type &op) -> std::vector { + return op.outputs_; + }) + .def("inputs", + [](const typename ClassType::type &op) -> std::vector { + return op.inputs_; + }) + .def("support_gpu", &ClassType::type::SupportGPU) + .def("temp_outputs", + [](const typename ClassType::type &op) -> std::vector { + auto iter = op.attrs_.find("temporary_index"); + std::vector ret; + if (iter == op.attrs_.end()) { + return ret; + } else { + auto tmp_idx = boost::get>(iter->second); + ret.reserve(tmp_idx.size()); + for (auto &index : tmp_idx) { + ret.push_back(op.outputs_.at(index)); + } + return ret; + } + }) + .def("__str__", &ClassType::type::DebugString); +} + +static size_t UniqueIntegerGenerator() { + static std::atomic generator; + return generator.fetch_add(1); +} + +bool IsCompileGPU() { +#ifdef 
PADDLE_ONLY_CPU + return false; +#else + return true; +#endif +} + +PYBIND11_PLUGIN(core) { + py::module m("core", "C++ core of PaddlePaddle"); + + py::class_(m, "Tensor", py::buffer_protocol()) + .def_buffer( + [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) + .def("get_dims", + [](const Tensor &self) { return vectorize(self.dims()); }) + .def("set_dims", + [](Tensor &self, const std::vector &dim) { + self.Resize(make_ddim(dim)); + }) + .def("alloc_float", + [](Tensor &self, paddle::platform::GPUPlace &place) { + self.mutable_data(place); + }) + .def("alloc_float", + [](Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) + .def("alloc_int", + [](Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) + .def("alloc_int", + [](Tensor &self, paddle::platform::GPUPlace &place) { + self.mutable_data(place); + }) + .def("set", PyCPUTensorSetFromArray) + .def("set", PyCPUTensorSetFromArray) +#ifndef PADDLE_ONLY_CPU + .def("set", PyCUDATensorSetFromArray) + .def("set", PyCUDATensorSetFromArray) +#endif + .def("shape", [](Tensor &self) { return vectorize(self.dims()); }) + .def("set_float_element", + [](Tensor &self, size_t offset, float f) { + // TODO(yuyang18): Only support GPU now. + self.data()[offset] = f; + }) + .def("get_float_element", [](Tensor &self, size_t offset) -> float { + // TODO(yuyang18): Only support GPU now. + return self.data()[offset]; + }); + + py::class_(m, "Variable", R"DOC(Variable Class. + +All parameter, weight, gradient are variables in Paddle. 
+)DOC") + .def("is_int", [](const Variable &var) { return var.IsType(); }) + .def("set_int", + [](Variable &var, int val) -> void { *var.GetMutable() = val; }) + .def("get_int", [](const Variable &var) -> int { return var.Get(); }) + .def("get_tensor", + [](Variable &self) -> Tensor * { return self.GetMutable(); }, + py::return_value_policy::reference) + .def("get_net", + [](Variable &self) -> ops::NetOp * { + return self.GetMutable(); + }, + py::return_value_policy::reference); + + py::class_(m, "Scope", "") + .def("new_var", + [](Scope &self, const std::string &name) -> Variable * { + return self.NewVar(name); + }, + py::return_value_policy::reference) + .def("find_var", &Scope::FindVar, py::return_value_policy::reference) + .def(py::init<>()) + .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, + py::return_value_policy::reference) + .def("drop_kids", &Scope::DropKids); + + //! @note: Be careful! PyBind will return std::string as an unicode, not + //! Python str. If you want a str object, you should cast them in Python. + m.def("get_all_op_protos", []() -> std::vector { + auto &protos = OpRegistry::protos(); + std::vector ret_values; + for (auto it = protos.begin(); it != protos.end(); ++it) { + PADDLE_ENFORCE(it->second.IsInitialized(), + "OpProto must all be initialized"); + std::string str; + PADDLE_ENFORCE(it->second.SerializeToString(&str), + "Serialize OpProto Error. 
This could be a bug of Paddle."); + ret_values.push_back(py::bytes(str)); + } + return ret_values; + }); + m.def_submodule( + "var_names", + "The module will return special predefined variable name in Paddle") + .def("empty", []() { return kEmptyVarName; }) + .def("temp", []() { return kTempVarName; }); + // clang-format off + py::class_(m, "DeviceContext") + .def_static("create", + [](paddle::platform::CPUPlace& place) + -> paddle::platform::DeviceContext* { + return new paddle::platform::CPUDeviceContext(); + }) + .def_static("create", + [](paddle::platform::GPUPlace& place) + -> paddle::platform::DeviceContext* { +#ifdef PADDLE_ONLY_CPU + PADDLE_THROW("GPUPlace is not supported in CPU device."); +#else + return new paddle::platform::CUDADeviceContext(place); +#endif + }); + // clang-format on + + py::class_(m, "GPUPlace").def(py::init()); + + py::class_(m, "CPUPlace").def(py::init<>()); + + py::class_> operator_base( + m, "Operator"); + + operator_base.def_static("create", [](py::bytes protobin) { + OpDesc desc; + PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), + "Cannot parse user input to OpDesc"); + PADDLE_ENFORCE(desc.IsInitialized(), + "User OpDesc is not initialized, reason %s", + desc.InitializationErrorString()); + return OpRegistry::CreateOp(desc); + }); + + operator_base.def("backward", + [](const OperatorBase &forwardOp, + const std::unordered_set &no_grad_vars) { + return Backward(forwardOp, no_grad_vars); + }); + + ExposeOperator(operator_base); + + py::class_> net(m, "Net"); + + net.def_static("create", + []() -> std::shared_ptr { + auto retv = std::make_shared(); + retv->type_ = "plain_net"; + return retv; + }) + .def("add_op", &ops::NetOp::AddOp) + .def( + "add_op", + [](ops::NetOp &self, const std::shared_ptr &net) -> void { + self.AddOp(std::static_pointer_cast(net)); + }) + .def("complete_add_op", &ops::NetOp::CompleteAddOp) + .def("complete_add_op", + [](std::shared_ptr &self) { self->CompleteAddOp(); }); + + ExposeOperator(net); + + 
m.def("unique_integer", UniqueIntegerGenerator); + + m.def("is_compile_gpu", IsCompileGPU); + + return m.ptr(); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 76070f636b0971f4a136042e056c59adb5dc2d40..c44df05e4b0fceed858fbf4f68eddc407a44c894 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -26,19 +26,17 @@ limitations under the License. */ #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { -namespace pybind { -namespace details { // forward declare -template -struct CastToPyBufferImpl; -} // namespace details -} // namespace pybind namespace framework { +namespace details { +template +struct CastToPyBufferImpl; +} class Tensor { public: template - friend struct paddle::pybind::details::CastToPyBufferImpl; + friend struct details::CastToPyBufferImpl; template friend struct EigenTensor; @@ -129,8 +127,8 @@ class Tensor { memory::PODDeleter(place)), place_(place), size_(size) { - PADDLE_ENFORCE(ptr_ != nullptr, "Insufficient %s memory to allocation.", - is_cpu_place(place_) ? "CPU" : "GPU"); + PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.", + (is_cpu_place(place_) ? "CPU" : "GPU")); } virtual size_t size() const { return size_; } @@ -167,4 +165,4 @@ class Tensor { } // namespace framework } // namespace paddle -#include "paddle/framework/detail/tensor-inl.h" +#include "paddle/framework/tensor_impl.h" diff --git a/paddle/framework/detail/tensor-inl.h b/paddle/framework/tensor_impl.h similarity index 80% rename from paddle/framework/detail/tensor-inl.h rename to paddle/framework/tensor_impl.h index e7ff09dd5c954378afeca299e901277c3ebdb96a..8d9bec6dc9c3f0af822a0d8cd8588dc932970652 100644 --- a/paddle/framework/detail/tensor-inl.h +++ b/paddle/framework/tensor_impl.h @@ -13,19 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once - #include "paddle/memory/memcpy.h" +#include "paddle/platform/enforce.h" namespace paddle { namespace framework { template inline void Tensor::check_memory_size() const { - PADDLE_ENFORCE(holder_ != nullptr, - "Tenosr holds no memory. Call Tensor::mutable_data first."); - PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_, - "Tensor's dims_ is out of bound. Call Tensor::mutable_data " - "first to re-allocate memory."); + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tenosr holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE_GE(holder_->size(), product(dims_) * sizeof(T) + offset_, + "Tensor's dims_ is out of bound. Call Tensor::mutable_data " + "first to re-allocate memory."); } template @@ -52,9 +52,9 @@ inline T* Tensor::mutable_data(DDim dims, platform::Place place) { template inline T* Tensor::mutable_data(platform::Place place) { static_assert(std::is_pod::value, "T must be POD"); - PADDLE_ENFORCE(product(dims_) > 0, - "Tensor's numel must be larger than zero to call " - "Tensor::mutable_data. Call Tensor::set_dim first."); + PADDLE_ENFORCE_GT(product(dims_), 0, + "Tensor's numel must be larger than zero to call " + "Tensor::mutable_data. 
Call Tensor::set_dim first."); /* some versions of boost::variant don't have operator!= */ size_t size = product(dims_) * sizeof(T); if (holder_ == nullptr || !(holder_->place() == place) || @@ -62,9 +62,11 @@ inline T* Tensor::mutable_data(platform::Place place) { if (platform::is_cpu_place(place)) { holder_.reset(new PlaceholderImpl( boost::get(place), size)); + } else if (platform::is_gpu_place(place)) { +#ifdef PADDLE_ONLY_CPU + PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); } -#ifndef PADDLE_ONLY_CPU - else if (platform::is_gpu_place(place)) { +#else holder_.reset(new PlaceholderImpl( boost::get(place), size)); } @@ -119,11 +121,11 @@ inline void Tensor::CopyFrom(const Tensor& src, template inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const { check_memory_size(); - PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero."); - PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound."); - PADDLE_ENFORCE(begin_idx < end_idx, - "Begin index must be less than end index."); - PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1."); + PADDLE_ENFORCE_GE(begin_idx, 0, "Slice begin index is less than zero."); + PADDLE_ENFORCE_LE(end_idx, dims_[0], "Slice end index is out of bound."); + PADDLE_ENFORCE_LT(begin_idx, end_idx, + "Begin index must be less than end index."); + PADDLE_ENFORCE_NE(dims_[0], 1, "Can not slice a tensor with dims_[0] = 1."); int base = product(dims_) / dims_[0]; Tensor dst; dst.holder_ = holder_; diff --git a/paddle/pybind/tensor_bind.h b/paddle/framework/tensor_py.h similarity index 64% rename from paddle/pybind/tensor_bind.h rename to paddle/framework/tensor_py.h index 995e102bf9d342e1604f5ae704288d6cf68d97a4..4e1ab77b157fe1adaeac55c271c056236f2d40de 100644 --- a/paddle/pybind/tensor_bind.h +++ b/paddle/framework/tensor_py.h @@ -13,15 +13,17 @@ limitations under the License. 
*/ #pragma once -#include -#include -#include +#include +#include "paddle/framework/tensor.h" +#include "paddle/memory/memcpy.h" +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" namespace py = pybind11; namespace paddle { -namespace pybind { +namespace framework { namespace details { @@ -40,9 +42,6 @@ template struct CastToPyBufferImpl { using CUR_TYPE = typename std::tuple_element>::type; py::buffer_info operator()(framework::Tensor &tensor) { - PADDLE_ENFORCE(paddle::platform::is_cpu_place(tensor.holder_->place()), - "Only CPU tensor can cast to numpy array"); - if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) { auto dim_vec = framework::vectorize(tensor.dims()); std::vector dims_outside; @@ -56,14 +55,16 @@ struct CastToPyBufferImpl { strides[i - 1] = sizeof(CUR_TYPE) * prod; prod *= dims_outside[i - 1]; } - + framework::Tensor dst_tensor; + if (paddle::platform::is_gpu_place(tensor.holder_->place())) { + dst_tensor.CopyFrom(tensor, platform::CPUPlace()); + } else if (paddle::platform::is_cpu_place(tensor.holder_->place())) { + dst_tensor = tensor; + } return py::buffer_info( - tensor.mutable_data(tensor.holder_->place()), - sizeof(CUR_TYPE), - py::format_descriptor::format(), - (size_t)framework::arity(tensor.dims()), - dims_outside, - strides); + dst_tensor.mutable_data(dst_tensor.holder_->place()), + sizeof(CUR_TYPE), py::format_descriptor::format(), + (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides); } else { constexpr bool less = I + 1 < std::tuple_size>::value; return CastToPyBufferImpl()(tensor); @@ -77,9 +78,10 @@ inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) { } template -void PyTensorSetFromArray( +void PyCPUTensorSetFromArray( framework::Tensor &self, - py::array_t array) { + py::array_t array, + paddle::platform::CPUPlace &place) { std::vector dims; dims.reserve(array.ndim()); for (size_t i = 0; i < array.ndim(); ++i) { @@ -87,9 +89,28 @@ void PyTensorSetFromArray( } 
self.Resize(framework::make_ddim(dims)); - auto *dst = self.mutable_data(paddle::platform::CPUPlace()); + auto *dst = self.mutable_data(place); std::memcpy(dst, array.data(), sizeof(T) * array.size()); } +#ifndef PADDLE_ONLY_CPU +template +void PyCUDATensorSetFromArray( + framework::Tensor &self, + py::array_t array, + paddle::platform::GPUPlace &place) { + std::vector dims; + dims.reserve(array.ndim()); + for (size_t i = 0; i < array.ndim(); ++i) { + dims.push_back((int)array.shape()[i]); + } + + self.Resize(framework::make_ddim(dims)); + auto *dst = self.mutable_data(place); + paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(), + cudaMemcpyHostToDevice); +} +#endif + } // namespace pybind } // namespace paddle diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index ef1cc10b840896d9ab97f963fc12a4971cd74e1f..20276181b974bb5b3d6cb40fb5e6c1295cf1c02f 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -36,7 +36,8 @@ TEST(Tensor, DataAssert) { } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = - "Tenosr holds no memory. Call Tensor::mutable_data first."; + "holder_ should not be null\nTenosr holds no memory. Call " + "Tensor::mutable_data first."; const char* what = err.what(); for (size_t i = 0; i < msg.length(); ++i) { ASSERT_EQ(what[i], msg[i]); @@ -111,7 +112,8 @@ TEST(Tensor, ShareDataWith) { } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = - "Tenosr holds no memory. Call Tensor::mutable_data first."; + "holder_ should not be null\nTenosr holds no memory. 
Call " + "Tensor::mutable_data first."; const char* what = err.what(); for (size_t i = 0; i < msg.length(); ++i) { ASSERT_EQ(what[i], msg[i]); diff --git a/paddle/function/BlockExpandOpTest.cpp b/paddle/function/BlockExpandOpTest.cpp index 5e4897e72ba9fab2dd9e25d90313dc1b4d38e2d4..59193a3ec3d0fabe7c841372394204ab568f5a2b 100644 --- a/paddle/function/BlockExpandOpTest.cpp +++ b/paddle/function/BlockExpandOpTest.cpp @@ -18,10 +18,10 @@ limitations under the License. */ namespace paddle { TEST(BlockExpandForward, real) { - for (size_t batchSize : {5, 32}) { - for (size_t channels : {1, 5, 32}) { - for (size_t inputHeight : {5, 33, 100}) { - for (size_t inputWidth : {5, 32, 96}) { + for (size_t batchSize : {5}) { + for (size_t channels : {1, 5}) { + for (size_t inputHeight : {5, 33}) { + for (size_t inputWidth : {5, 32}) { for (size_t block : {1, 3, 5}) { for (size_t stride : {1, 2}) { for (size_t padding : {0, 1}) { @@ -61,10 +61,10 @@ TEST(BlockExpandForward, real) { } TEST(BlockExpandBackward, real) { - for (size_t batchSize : {5, 32}) { - for (size_t channels : {1, 5, 32}) { - for (size_t inputHeight : {5, 33, 100}) { - for (size_t inputWidth : {5, 32, 96}) { + for (size_t batchSize : {5}) { + for (size_t channels : {1, 5}) { + for (size_t inputHeight : {5, 33}) { + for (size_t inputWidth : {5, 32}) { for (size_t block : {1, 3, 5}) { for (size_t stride : {1, 2}) { for (size_t padding : {0, 1}) { diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp index 1744f377808f137dcda4a28acce336dc22be3d01..6b8e1e2da9775ccd03c84cc86ad226f3c00ab7fe 100644 --- a/paddle/function/BufferArgTest.cpp +++ b/paddle/function/BufferArgTest.cpp @@ -32,7 +32,7 @@ TEST(BufferTest, SequenceIdArg) { sizeOfValuType(VALUE_TYPE_INT32)); SequenceIdArg buffer(memory.getBuf(), shape); EXPECT_EQ(buffer.data(), memory.getBuf()); - EXPECT_EQ(buffer.numSeqs(), 9); + EXPECT_EQ(buffer.numSeqs(), 9U); } } // namespace paddle diff --git 
a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu index 1a5b4042402df3081a493962a5e080d72b7f40b2..4492dea5d8a6f8580a13f3059401c87fa2164085 100644 --- a/paddle/function/ContextProjectionOpGpu.cu +++ b/paddle/function/ContextProjectionOpGpu.cu @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "ContextProjectionOp.h" +#include "hl_base.h" namespace paddle { @@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input, int block_size = blockDim.x; int sequenceId = blockIdx.x; int seq_start = sequence[sequenceId]; - int seq_end = sequence[sequenceId+1]; + int seq_end = sequence[sequenceId + 1]; real value = 0; int instances = seq_end - seq_start + context_length - 1; @@ -49,8 +49,9 @@ __global__ void KeContextProjectionForward(const real* input, } else if ((i + context_start) >= (seq_end - seq_start)) { if (padding) { value = - weight[(begin_pad + i + context_start - (seq_end - seq_start)) * - input_dim + idx]; + weight[(begin_pad + i + context_start - (seq_end - seq_start)) * + input_dim + + idx]; } else { continue; } @@ -61,7 +62,7 @@ __global__ void KeContextProjectionForward(const real* input, int outx = (i - context_length) < 0 ? i : (context_length - 1); int outy = (i - context_length) < 0 ? 
0 : (i - (context_length - 1)); real* output_r = - output + outy * input_dim * context_length + outx * input_dim; + output + outy * input_dim * context_length + outx * input_dim; for (int j = outy; j < seq_end - seq_start; j++) { output_r[idx] += value; if (j - outy == outx) break; @@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input, dim3 grid(blocks_x, blocks_y); if (weight) { - KeContextProjectionForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (input, sequence, weight, output, input_dim, - context_length, context_start, begin_pad); - } else { - KeContextProjectionForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (input, sequence, weight, output, input_dim, - context_length, context_start, begin_pad); + KeContextProjectionForward<<>>( + input, + sequence, + weight, + output, + input_dim, + context_length, + context_start, + begin_pad); + } else { + KeContextProjectionForward<<>>( + input, + sequence, + weight, + output, + input_dim, + context_length, + context_start, + begin_pad); } CHECK_SYNC("hl_context_projection_forward failed"); } @@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad, int block_size = blockDim.x; int sequenceId = blockIdx.x; int seq_start = sequence[sequenceId]; - int seq_end = sequence[sequenceId+1]; + int seq_end = sequence[sequenceId + 1]; real value = 0; int instances = seq_end - seq_start + context_length - 1; @@ -170,7 +183,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad, int outx = (i - context_length) < 0 ? i : (context_length - 1); int outy = (i - context_length) < 0 ? 
0 : (i - (context_length - 1)); real* output_r = - out + outy * input_dim * context_length + outx * input_dim; + out + outy * input_dim * context_length + outx * input_dim; for (int j = outy; j < seq_end - seq_start; j++) { value += output_r[idx]; if (j - outy == outx) break; @@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad, int blocks_y = 1; dim3 threads(block_size, 1); dim3 grid(blocks_x, blocks_y); - KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>> - (out_grad, sequence, input_grad, input_dim, context_length, context_start); + KeContextProjectionBackwardData<<>>( + out_grad, sequence, input_grad, input_dim, context_length, context_start); CHECK_SYNC("hl_context_projection_backward_data failed"); } @@ -231,7 +244,7 @@ void ContextProjectionBackwardData(const GpuMatrix& out_grad, context_start); } -template +template __global__ void KeContextProjectionBackwardWeight(const real* out_grad, const int* sequence, real* w_grad, @@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad, if (weight_idx < w_dim) { for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) { int seq_start = sequence[seqId]; - int seq_end = sequence[seqId+1]; - output_r = const_cast(out_grad) - + seq_start * w_dim * context_length; + int seq_end = sequence[seqId + 1]; + output_r = + const_cast(out_grad) + seq_start * w_dim * context_length; if (context_start < 0) { if (padId + context_start < 0) { instanceId = padId; } else { // begin_pad > 0; - instanceId = (padId - begin_pad) + - (seq_end - seq_start) - context_start; + instanceId = + (padId - begin_pad) + (seq_end - seq_start) - context_start; } } else { if (padId + (seq_end - seq_start) < context_start) { @@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad, } } - int outx = (instanceId - context_length) < 0 ? - instanceId : (context_length - 1); - int outy = (instanceId - context_length) < 0 ? 
- 0 : (instanceId - (context_length - 1)); + int outx = + (instanceId - context_length) < 0 ? instanceId : (context_length - 1); + int outy = (instanceId - context_length) < 0 + ? 0 + : (instanceId - (context_length - 1)); output_r += outy * w_dim * context_length + outx * w_dim; for (int j = outy; j < seq_end - seq_start; j++) { value += output_r[weight_idx]; @@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad, } __syncthreads(); - for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) { + for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) { if (idy < stride) { sum_s[idy][idx] += sum_s[idy + stride][idx]; } @@ -339,22 +353,27 @@ void hl_context_projection_backward_weight(const real* out_grad, dim3 threads(threads_x, threads_y); dim3 grid(blocks_x, 1); - KeContextProjectionBackwardWeight<32, 32> - <<< grid, threads, 0, STREAM_DEFAULT >>> - (out_grad, sequence, w_grad, num_sequences, w_dim, - context_length, context_start, begin_pad); + KeContextProjectionBackwardWeight<32, + 32><<>>( + out_grad, + sequence, + w_grad, + num_sequences, + w_dim, + context_length, + context_start, + begin_pad); CHECK_SYNC("hl_context_projection_backward_weight failed"); } template <> -void ContextProjectionBackwardWeight( - const GpuMatrix& out_grad, - GpuMatrix& w_grad, - const GpuIVector& seq_vec, - size_t context_length, - int context_start, - size_t total_pad, - size_t begin_pad) { +void ContextProjectionBackwardWeight(const GpuMatrix& out_grad, + GpuMatrix& w_grad, + const GpuIVector& seq_vec, + size_t context_length, + int context_start, + size_t total_pad, + size_t begin_pad) { hl_context_projection_backward_weight(out_grad.getData(), seq_vec.getData(), w_grad.getData(), @@ -376,23 +395,18 @@ void ContextProjectionBackward(const GpuMatrix& out_grad, size_t begin_pad, bool is_padding, size_t total_pad) { - if (in_grad) { - ContextProjectionBackwardData( - out_grad, - in_grad, - sequence, - context_length, - 
context_start); - } - if (is_padding && w_grad) { - ContextProjectionBackwardWeight( - out_grad, - w_grad, - sequence, - context_length, - context_start, - total_pad, - begin_pad); + if (in_grad) { + ContextProjectionBackwardData( + out_grad, in_grad, sequence, context_length, context_start); + } + if (is_padding && w_grad) { + ContextProjectionBackwardWeight(out_grad, + w_grad, + sequence, + context_length, + context_start, + total_pad, + begin_pad); } } diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h index bb4f48364b9b454af7d37fe4d3c340666e53285c..baf78bc6c88d0d294f4457b81c52b22e425d9fdb 100644 --- a/paddle/function/ConvOp.h +++ b/paddle/function/ConvOp.h @@ -109,6 +109,13 @@ protected: return filter[filter.ndims() - 1]; } + // determine whether im2col needs to be performed + inline bool isNeedIm2col(const TensorShape& filter) const { + return !(getFilterHeight(filter) == 1 && getFilterWidth(filter) == 1 && + strideH() == 1 && strideW() == 1 && paddingH() == 0 && + paddingW() == 0); + } + std::vector strides_; std::vector paddings_; diff --git a/paddle/function/CosSimOpGpu.cu b/paddle/function/CosSimOpGpu.cu index c62ab39551f02288618244871ae31c6800df5b42..a1f88f479b5818e3864129a4dac723bceed76fcf 100644 --- a/paddle/function/CosSimOpGpu.cu +++ b/paddle/function/CosSimOpGpu.cu @@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "CosSimOp.h" #include "hl_base.h" #include "hl_device_functions.cuh" -#include "CosSimOp.h" namespace paddle { -template +template __global__ void KeCosSim(real* output, const real* input1, const real* input2, @@ -78,8 +78,8 @@ void hlCossim(real* output, dim3 threads(block_size, 1); dim3 grid(1, input1_height); - KeCosSim<<>> - (output, input1, input2, width, input1_height, input2_height, scale); + KeCosSim<<>>( + output, input1, input2, width, input1_height, input2_height, scale); CHECK_SYNC("hlCossim failed"); } @@ -99,7 +99,7 @@ void CosSimForward(GpuMatrix& out_mat, hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale); } -template +template __global__ void KeCosSimDerivative(const real* grad, const real* output, const real* prev_out_x, @@ -148,14 +148,13 @@ __global__ void KeCosSimDerivative(const real* grad, if (xy[0] == 0) { real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0])); for (int index = tid; index < width; index += block_size) { - prev_grad_x[index] += - scale * grad[ty] * prev_out_y[index] * reciprocal; + prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal; if (input2_height > 1) { - prev_grad_y[index] += - scale * grad[ty] * prev_out_x[index] * reciprocal; + prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal; } else { - paddle::paddleAtomicAdd(prev_grad_y + index, - scale * grad[ty] * prev_out_x[index] * reciprocal); + paddle::paddleAtomicAdd( + prev_grad_y + index, + scale * grad[ty] * prev_out_x[index] * reciprocal); } } } else { @@ -163,17 +162,18 @@ __global__ void KeCosSimDerivative(const real* grad, real reciprocalSquareSumX = 1.0 / xx[0]; real reciprocalSquareSumY = 1.0 / yy[0]; for (int index = tid; index < width; index += block_size) { - prev_grad_x[index] += output[ty] * grad[ty] * - (prev_out_y[index] * reciprocalXY - - prev_out_x[index] * reciprocalSquareSumX); + prev_grad_x[index] += + output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY - + 
prev_out_x[index] * reciprocalSquareSumX); if (input2_height > 1) { - prev_grad_y[index] += output[ty] * grad[ty] * - (prev_out_x[index] * reciprocalXY - - prev_out_y[index] * reciprocalSquareSumY); + prev_grad_y[index] += + output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY - + prev_out_y[index] * reciprocalSquareSumY); } else { - paddle::paddleAtomicAdd(prev_grad_y + index, output[ty] * grad[ty] * - (prev_out_x[index] * reciprocalXY - - prev_out_y[index] * reciprocalSquareSumY)); + paddle::paddleAtomicAdd( + prev_grad_y + index, + output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY - + prev_out_y[index] * reciprocalSquareSumY)); } } } @@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad, const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, input1_height); - KeCosSimDerivative<<>> - (grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y, width, - input1_height, input2_height, scale); + KeCosSimDerivative<<>>( + grad, + output, + prev_out_x, + prev_out_y, + prev_grad_x, + prev_grad_y, + width, + input1_height, + input2_height, + scale); CHECK_SYNC("hlCossimDerivate failed"); } @@ -214,9 +222,9 @@ void CosSimBackward(const GpuMatrix& out_grad, real scale) { CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() && in2_val.getData() && in1_grad.getData() && in2_grad.getData()); - CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ - && in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_) - << "Matrix types are not equally GPU"; + CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ && + in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_) + << "Matrix types are not equally GPU"; size_t dim = in1_val.getWidth(); const real* grad = out_grad.getData(); diff --git a/paddle/function/CropOpGpu.cu b/paddle/function/CropOpGpu.cu index 786eb268d45aadee0c1f6fcbbafc23173cf0bc77..241356a9ca0b673c86ff4c39594722211e2d224e 100644 --- a/paddle/function/CropOpGpu.cu +++ b/paddle/function/CropOpGpu.cu 
@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "CropOp.h" +#include "hl_base.h" namespace paddle { -__global__ void KeCrop(real* outputs, const real* inputs, - int inC, int inH, int inW, - int cropC, int cropH, int cropW, - int outC, int outH, int outW, int nthreads) { +__global__ void KeCrop(real* outputs, + const real* inputs, + int inC, + int inH, + int inW, + int cropC, + int cropH, + int cropW, + int outC, + int outH, + int outW, + int nthreads) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < nthreads) { const int w = idx % outW; @@ -35,12 +43,12 @@ __global__ void KeCrop(real* outputs, const real* inputs, template <> void Crop(real* outputs, - const real* inputs, - const TensorShape inShape, - const TensorShape outShape, - const FuncConfig& conf) { + const real* inputs, + const TensorShape inShape, + const TensorShape outShape, + const FuncConfig& conf) { std::vector crop_corner = - conf.get>("crop_corner"); + conf.get>("crop_corner"); int cropC = crop_corner[1]; int cropH = crop_corner[2]; int cropW = crop_corner[3]; @@ -58,16 +66,33 @@ void Crop(real* outputs, int blockSize = 1024; int gridSize = (nth + blockSize - 1) / blockSize; - KeCrop<<>> - (outputs, inputs, inC, inH, inW, cropC, cropH, cropW, - outC, outH, outW, nth); + KeCrop<<>>(outputs, + inputs, + inC, + inH, + inW, + cropC, + cropH, + cropW, + outC, + outH, + outW, + nth); CHECK_SYNC("Crop"); } -__global__ void KeCropDiff(const real* inGrad, real* outGrad, - int inC, int inH, int inW, - int cropC, int cropH, int cropW, - int outC, int outH, int outW, int nthreads) { +__global__ void KeCropDiff(const real* inGrad, + real* outGrad, + int inC, + int inH, + int inW, + int cropC, + int cropH, + int cropW, + int outC, + int outH, + int outW, + int nthreads) { const int idx = threadIdx.x + blockIdx.x * 
blockDim.x; if (idx < nthreads) { const int w = idx % inW; @@ -84,12 +109,12 @@ __global__ void KeCropDiff(const real* inGrad, real* outGrad, template <> void CropGrad(const real* inGrad, - real* outGrad, - const TensorShape inShape, - const TensorShape outShape, - const FuncConfig& conf) { + real* outGrad, + const TensorShape inShape, + const TensorShape outShape, + const FuncConfig& conf) { std::vector crop_corner = - conf.get>("crop_corner"); + conf.get>("crop_corner"); int cropC = crop_corner[1]; int cropH = crop_corner[2]; int cropW = crop_corner[3]; @@ -107,9 +132,18 @@ void CropGrad(const real* inGrad, int blockSize = 1024; int gridSize = (nth + blockSize - 1) / blockSize; - KeCropDiff <<>> - (inGrad, outGrad, inC, inH, inW, cropC, cropH, cropW, - outC, outH, outW, nth); + KeCropDiff<<>>(inGrad, + outGrad, + inC, + inH, + inW, + cropC, + cropH, + cropW, + outC, + outH, + outW, + nth); CHECK_SYNC("CropGrad"); } diff --git a/paddle/function/CrossMapNormalOpGpu.cu b/paddle/function/CrossMapNormalOpGpu.cu index b33dd108348b7789c6e73bfe3b1ffbc448163ef7..88b991ff6a1f028b333e82e2801ed2e9251aa36d 100644 --- a/paddle/function/CrossMapNormalOpGpu.cu +++ b/paddle/function/CrossMapNormalOpGpu.cu @@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "hl_base.h" #include "CrossMapNormalOp.h" +#include "hl_base.h" namespace paddle { -__global__ void KeCMRNormFillScale(size_t imageSize, const real* in, - real* scale, size_t channels, - size_t height, size_t width, size_t size, +__global__ void KeCMRNormFillScale(size_t imageSize, + const real* in, + real* scale, + size_t channels, + size_t height, + size_t width, + size_t size, real alpha) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < imageSize) { @@ -51,8 +55,10 @@ __global__ void KeCMRNormFillScale(size_t imageSize, const real* in, } } -__global__ void KeCMRNormOutput(size_t inputSize, const real* in, - const real* scale, real negative_beta, +__global__ void KeCMRNormOutput(size_t inputSize, + const real* in, + const real* scale, + real negative_beta, real* out) { const int index = threadIdx.x + blockIdx.x * blockDim.x; if (index < inputSize) { @@ -74,24 +80,30 @@ void CrossMapNormal(real* outputs, size_t imageSize = numSamples * height * width; int blockSize = 1024; int gridSize = (imageSize + 1024 - 1) / 1024; - KeCMRNormFillScale<<>> - (imageSize, inputs, denoms, channels, height, width, size, scale); + KeCMRNormFillScale<<>>( + imageSize, inputs, denoms, channels, height, width, size, scale); - size_t inputSize = numSamples * height * width *channels; + size_t inputSize = numSamples * height * width * channels; blockSize = 1024; gridSize = (inputSize + 1024 - 1) / 1024; - KeCMRNormOutput<<>> - (inputSize, inputs, denoms, -pow, outputs); + KeCMRNormOutput<<>>( + inputSize, inputs, denoms, -pow, outputs); CHECK_SYNC("CrossMapNormal"); } -__global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data, - const real* top_data, const real* scale, - const real* top_diff, size_t channels, - size_t height, size_t width, size_t size, - real negative_beta, real cache_ratio, - real* bottom_diff ) { +__global__ void KeCMRNormDiff(size_t imageSize, + const real* bottom_data, + const real* top_data, + const real* scale, + const 
real* top_diff, + size_t channels, + size_t height, + size_t width, + size_t size, + real negative_beta, + real cache_ratio, + real* bottom_diff) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < imageSize) { const int w = idx % width; @@ -113,17 +125,17 @@ __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data, while (index < channels + post_pad) { if (index < channels) { accum += top_diff[index * step] * top_data[index * step] / - scale[index * step]; + scale[index * step]; } if (index >= size) { accum -= top_diff[(index - size) * step] * - top_data[(index - size) * step] / scale[(index - size) * step]; + top_data[(index - size) * step] / scale[(index - size) * step]; } if (index >= post_pad) { bottom_diff[(index - post_pad) * step] += - top_diff[(index - post_pad) * step] * - pow(scale[(index - post_pad) * step], negative_beta) - cache_ratio * - bottom_data[(index - post_pad) * step] * accum; + top_diff[(index - post_pad) * step] * + pow(scale[(index - post_pad) * step], negative_beta) - + cache_ratio * bottom_data[(index - post_pad) * step] * accum; } ++index; } @@ -147,9 +159,18 @@ void CrossMapNormalGrad(real* inputsGrad, int blockSize = 1024; int gridSize = (imageSize + 1024 - 1) / 1024; - KeCMRNormDiff <<>> - (imageSize, inputsValue, outputsValue, denoms, outputsGrad, channels, - height, width, size, -pow, 2.0f * pow * scale, inputsGrad); + KeCMRNormDiff<<>>(imageSize, + inputsValue, + outputsValue, + denoms, + outputsGrad, + channels, + height, + width, + size, + -pow, + 2.0f * pow * scale, + inputsGrad); CHECK_SYNC("CrossMapNormalGrad"); } diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/function/CrossMapNormalOpTest.cpp index ed17b17da616db9d52318f21c133458d698b0dd8..3b390db77f085aecfd65a9aa64e68ecc189163c1 100644 --- a/paddle/function/CrossMapNormalOpTest.cpp +++ b/paddle/function/CrossMapNormalOpTest.cpp @@ -18,11 +18,11 @@ limitations under the License. 
*/ namespace paddle { TEST(CrossMapNormal, real) { - for (size_t numSamples : {5, 32}) { - for (size_t channels : {1, 5, 32}) { - for (size_t imgSizeH : {5, 33, 100}) { - for (size_t imgSizeW : {5, 32, 96}) { - for (size_t size : {1, 2, 3, 5, 7}) { + for (size_t numSamples : {5}) { + for (size_t channels : {1, 5}) { + for (size_t imgSizeH : {5, 33}) { + for (size_t imgSizeW : {5, 32}) { + for (size_t size : {1, 3}) { VLOG(3) << " numSamples=" << numSamples << " channels=" << channels << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW << " size=" << size; @@ -48,11 +48,11 @@ TEST(CrossMapNormal, real) { } TEST(CrossMapNormalGrad, real) { - for (size_t numSamples : {5, 32}) { - for (size_t channels : {1, 5, 32}) { - for (size_t imgSizeH : {5, 33, 100}) { - for (size_t imgSizeW : {5, 32, 96}) { - for (size_t size : {1, 2, 3, 5, 7}) { + for (size_t numSamples : {5}) { + for (size_t channels : {1, 5}) { + for (size_t imgSizeH : {5, 33}) { + for (size_t imgSizeW : {5, 32}) { + for (size_t size : {1, 3}) { VLOG(3) << " numSamples=" << numSamples << " channels=" << channels << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW << " size=" << size; diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu index ede0d27aa82e7d71ff5bc33df110fec260e06463..33463805cbd4746c05548028e0bc4a0e2a90453e 100644 --- a/paddle/function/DepthwiseConvOpGpu.cu +++ b/paddle/function/DepthwiseConvOpGpu.cu @@ -20,17 +20,25 @@ namespace paddle { // CUDA kernel to compute the depthwise convolution forward pass template -__global__ -void ConvolutionDepthwiseForward(const int nthreads, - const T* const inputData, const T* const filterData, - const int batchSize, const int outputChannels, const int outputHeight, - const int outputWidth, const int inputChannels, const int inputHeight, - const int inputWidth, const int filterMultiplier, const int filterHeight, - const int filterWidth, const int strideH, const int strideW, - const int paddingH, const int 
paddingW, T* const outputData) { - - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; +__global__ void ConvolutionDepthwiseForward(const int nthreads, + const T* const inputData, + const T* const filterData, + const int batchSize, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const outputData) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < nthreads) { const int batch = index / outputChannels / outputHeight / outputWidth; @@ -45,32 +53,36 @@ void ConvolutionDepthwiseForward(const int nthreads, const int w_in_start = -paddingW + w_out * strideW; const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1; const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1; - if ((h_in_start >= 0) && (h_in_end < inputHeight) - && (w_in_start >= 0) && (w_in_end < inputWidth)) { - for (int kh = 0; kh < filterHeight; ++kh) { - for (int kw = 0; kw < filterWidth; ++kw) { - const int h_in = -paddingH + h_out * strideH + kh; - const int w_in = -paddingW + w_out * strideW + kw; - const int offset = ((batch * inputChannels + c_in) - * inputHeight + h_in) * inputWidth + w_in; - value += (*weight) * inputData[offset]; - ++weight; - } + if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) && + (w_in_end < inputWidth)) { + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; + const int offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * + inputWidth + + w_in; + value += (*weight) * inputData[offset]; + ++weight; } + } } else { - for (int kh = 0; kh 
< filterHeight; ++kh) { - for (int kw = 0; kw < filterWidth; ++kw) { - const int h_in = -paddingH + h_out * strideH + kh; - const int w_in = -paddingW + w_out * strideW + kw; - if ((h_in >= 0) && (h_in < inputHeight) - && (w_in >= 0) && (w_in < inputWidth)) { - const int offset = ((batch * inputChannels + c_in) - * inputHeight + h_in) * inputWidth + w_in; - value += (*weight) * inputData[offset]; - } - ++weight; - } - } + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; + if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && + (w_in < inputWidth)) { + const int offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * + inputWidth + + w_in; + value += (*weight) * inputData[offset]; + } + ++weight; + } + } } outputData[index] = value; } @@ -78,16 +90,25 @@ void ConvolutionDepthwiseForward(const int nthreads, // CUDA kernel to compute the depthwise convolution backprop w.r.t input. 
template -__global__ -void ConvolutionDepthwiseInputBackward(const int nthreads, - const T* const top_diff, const T* const weight_data, - const int num, const int outputChannels, const int outputHeight, - const int outputWidth, const int inputChannels, const int inputHeight, - const int inputWidth, const int filterMultiplier, const int filterHeight, - const int filterWidth, const int strideH, const int strideW, - const int paddingH, const int paddingW, T* const bottom_diff) { - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; +__global__ void ConvolutionDepthwiseInputBackward(const int nthreads, + const T* const top_diff, + const T* const weight_data, + const int num, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const bottom_diff) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < nthreads) { const int batch = index / inputChannels / inputHeight / inputWidth; const int c_in = (index / inputHeight / inputWidth) % inputChannels; @@ -96,65 +117,80 @@ void ConvolutionDepthwiseInputBackward(const int nthreads, const int c_out_start = c_in * filterMultiplier; - int h_out_start = (h_in - filterHeight + paddingH + strideH)/strideH; + int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH; h_out_start = 0 > h_out_start ? 0 : h_out_start; - int h_out_end = (h_in + paddingH)/strideH; - h_out_end = outputHeight - 1 < h_out_end? outputHeight - 1 : h_out_end; - int w_out_start = (w_in - filterWidth + paddingW + strideW)/strideW; + int h_out_end = (h_in + paddingH) / strideH; + h_out_end = outputHeight - 1 < h_out_end ? 
outputHeight - 1 : h_out_end; + int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW; w_out_start = 0 > w_out_start ? 0 : w_out_start; - int w_out_end = (w_in + paddingW)/strideW; - w_out_end = outputWidth - 1 < w_out_end? outputWidth - 1 : w_out_end; + int w_out_end = (w_in + paddingW) / strideW; + w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end; T value = 0; - for (int c_out = c_out_start; - c_out < c_out_start + filterMultiplier; c_out ++) { - for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { - const int filter_h = h_in + paddingH - h_out * strideH; - for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { - const int filter_w = w_in + paddingW - w_out * strideW; - const int filter_offset = c_out * filterHeight * filterWidth - + filter_h * filterWidth + filter_w; - const int top_diff_offset = ((batch * outputChannels + c_out) * - outputHeight + h_out)* outputWidth + w_out; - value += top_diff[top_diff_offset] * weight_data[filter_offset]; - } + for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier; + c_out++) { + for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { + const int filter_h = h_in + paddingH - h_out * strideH; + for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { + const int filter_w = w_in + paddingW - w_out * strideW; + const int filter_offset = c_out * filterHeight * filterWidth + + filter_h * filterWidth + filter_w; + const int top_diff_offset = + ((batch * outputChannels + c_out) * outputHeight + h_out) * + outputWidth + + w_out; + value += top_diff[top_diff_offset] * weight_data[filter_offset]; } + } } bottom_diff[index] += value; - } + } } // CUDA kernel to compute the depthwise convolution backprop w.r.t filter. 
template -__global__ -void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, - const T* const top_diff, const T* const inputData, - const int num, const int outputChannels, const int outputHeight, - const int outputWidth, const int inputChannels, const int inputHeight, - const int inputWidth, const int filterMultiplier, const int filterHeight, - const int filterWidth, const int strideH, const int strideW, - const int paddingH, const int paddingW, T* const buffer_data) { - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; +__global__ void ConvolutionDepthwiseFilterBackward(const int num_i, + const int nthreads, + const T* const top_diff, + const T* const inputData, + const int num, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const buffer_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < nthreads) { const int h_out = (index / outputWidth) % outputHeight; const int w_out = index % outputWidth; - const int kh = (index / filterWidth / outputHeight / outputWidth) - % filterHeight; + const int kh = + (index / filterWidth / outputHeight / outputWidth) % filterHeight; const int kw = (index / outputHeight / outputWidth) % filterWidth; const int h_in = -paddingH + h_out * strideH + kh; const int w_in = -paddingW + w_out * strideW + kw; - if ((h_in >= 0) && (h_in < inputHeight) - && (w_in >= 0) && (w_in < inputWidth)) { - const int c_out = index / - (filterHeight * filterWidth * outputHeight * outputWidth); + if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && + (w_in < inputWidth)) { + const int c_out = + index / (filterHeight * filterWidth * outputHeight * outputWidth); const int c_in = 
c_out / filterMultiplier; const int batch = num_i; - const int top_offset = ((batch * outputChannels + c_out) * - outputHeight + h_out) * outputWidth + w_out; - const int bottom_offset = ((batch * inputChannels + c_in) - * inputHeight + h_in) * inputWidth + w_in; + const int top_offset = + ((batch * outputChannels + c_out) * outputHeight + h_out) * + outputWidth + + w_out; + const int bottom_offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth + + w_in; buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset]; } else { buffer_data[index] = 0; @@ -163,170 +199,169 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, } template -class DepthwiseConvFunctor{ +class DepthwiseConvFunctor { public: void operator()(const T* inputData, - const T* filterData, - int batchSize, - int outputChannels, - int outputHeight, - int outputWidth, - int inputChannels, - int inputHeight, - int inputWidth, - int filterMultiplier, - int filterHeight, - int filterWidth, - int strideH, - int strideW, - int paddingH, - int paddingW, - T* outputData){ + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* outputData) { int outputSize = batchSize * outputChannels * outputHeight * outputWidth; - size_t blocks = (outputSize + 1024 -1) / 1024; + size_t blocks = (outputSize + 1024 - 1) / 1024; size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; + size_t blockY = (blocks + 512 - 1) / 512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); - ConvolutionDepthwiseForward - <<< grid, threads, 0, STREAM_DEFAULT >>>( - outputSize, - inputData, - filterData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputChannels, - inputHeight, - inputWidth, - filterMultiplier, - filterHeight, - 
filterWidth, - strideH, - strideW, - paddingH, - paddingW, - outputData); - } + ConvolutionDepthwiseForward<<>>( + outputSize, + inputData, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + outputData); + } }; template -class DepthwiseConvGradInputFunctor{ +class DepthwiseConvGradInputFunctor { public: void operator()(const T* outputGrad, - const T* filterData, - int batchSize, - int outputChannels, - int outputHeight, - int outputWidth, - int inputChannels, - int inputHeight, - int inputWidth, - int filterMultiplier, - int filterHeight, - int filterWidth, - int strideH, - int strideW, - int paddingH, - int paddingW, - T* inputGrad){ + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* inputGrad) { int inputSize = batchSize * inputChannels * inputHeight * inputWidth; - size_t blocks = (inputSize + 1024 -1) / 1024; + size_t blocks = (inputSize + 1024 - 1) / 1024; size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; + size_t blockY = (blocks + 512 - 1) / 512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); - ConvolutionDepthwiseInputBackward - // NOLINT_NEXT_LINE(whitespace/operators) - <<< grid, threads, 0, STREAM_DEFAULT >>>( - inputSize, - outputGrad, - filterData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputChannels, - inputHeight, - inputWidth, - filterMultiplier, - filterHeight, - filterWidth, - strideH, - strideW, - paddingH, - paddingW, - inputGrad); - } + // NOLINT_NEXT_LINE(whitespace/operators) + <<>>(inputSize, + outputGrad, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + 
inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + inputGrad); + } }; template class DepthwiseConvGradFilterFunctor { public: void operator()(const T* outputGrad, - const T* inputData, - int batchSize, - int outputChannels, - int outputHeight, - int outputWidth, - int inputChannels, - int inputHeight, - int inputWidth, - int filterMultiplier, - int filterHeight, - int filterWidth, - int strideH, - int strideW, - int paddingH, - int paddingW, - T* colData, - T* filterGrad){ - int colDataSize = outputChannels * filterHeight * filterWidth - * outputHeight * outputWidth; + const T* inputData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* colData, + T* filterGrad) { + int colDataSize = outputChannels * filterHeight * filterWidth * + outputHeight * outputWidth; - size_t blocks = (colDataSize + 1024 -1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, - 1, filterGrad, false, true); + size_t blocks = (colDataSize + 1024 - 1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, + 1, + filterGrad, + false, + true); - for (int i = 0; i < batchSize; i++) { - ConvolutionDepthwiseFilterBackward - <<< grid, threads, 0, STREAM_DEFAULT >>>( - i, - colDataSize, - outputGrad, - inputData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputChannels, - inputHeight, - inputWidth, - filterMultiplier, - filterHeight, - filterWidth, - strideH, - strideW, - paddingH, - paddingW, - 
colData); - int K = outputHeight * outputWidth; - int M = colDataSize / K; + for (int i = 0; i < batchSize; i++) { + ConvolutionDepthwiseFilterBackward< + T><<>>(i, + colDataSize, + outputGrad, + inputData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + colData); + int K = outputHeight * outputWidth; + int M = colDataSize / K; - BaseMatrix colMatrix(M, K, colData, false, true); - filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); - } + BaseMatrix colMatrix(M, K, colData, false, true); + filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); } + } }; #ifdef PADDLE_TYPE_DOUBLE diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp index fdf7e631e5ab8c67eb5cf906bd0af49740d60112..6360a6e023ebd2f97c442c80c8d7f56b5ec4cbf7 100644 --- a/paddle/function/FunctionTest.cpp +++ b/paddle/function/FunctionTest.cpp @@ -24,14 +24,14 @@ void FunctionApi(typename Tensor::Matrix& output, template <> void FunctionApi(CpuMatrix& output, const CpuMatrix& input) { - EXPECT_EQ(output.getHeight(), 100); - EXPECT_EQ(output.getWidth(), 200); + EXPECT_EQ(output.getHeight(), 100U); + EXPECT_EQ(output.getWidth(), 200U); } template <> void FunctionApi(GpuMatrix& output, const GpuMatrix& input) { - EXPECT_EQ(output.getHeight(), 10); - EXPECT_EQ(output.getWidth(), 20); + EXPECT_EQ(output.getHeight(), 10U); + EXPECT_EQ(output.getWidth(), 20U); } template @@ -85,14 +85,14 @@ void testBufferArgs(const BufferArgs& inputs, } void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) { - EXPECT_EQ(inputs.size(), 1); + EXPECT_EQ(inputs.size(), 1U); check(inputs[0]); } TEST(Arguments, Matrix) { MatrixPtr matrix = Matrix::create(100, 200); CheckBufferArg check = [=](const BufferArg& arg) { - EXPECT_EQ(arg.shape().ndims(), 2); + EXPECT_EQ(arg.shape().ndims(), 2U); EXPECT_EQ(arg.shape()[0], 100); 
EXPECT_EQ(arg.shape()[1], 200); EXPECT_EQ(arg.data(), matrix->getData()); diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 9deb2739fcfff935a98a0b5b31b5d11819d81227..0ada4d70a0c7d13f9b5fb1a42eac07fc4c775a87 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -66,16 +66,23 @@ public: real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); real* outputData = outputs[0].data(); + bool needIm2col = isNeedIm2col(filter); + TensorShape imShape = TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - TensorShape colShape = TensorShape({inputChannels / groups_, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - resizeBuffer(colShape.getElements()); - real* colData = reinterpret_cast(memory_->getBuf()); + TensorShape colShape; + real* colData = NULL; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + resizeBuffer(colShape.getElements()); + colData = reinterpret_cast(memory_->getBuf()); + } Im2ColFunctor im2col; GemmFunctor gemm; @@ -86,15 +93,18 @@ public: for (size_t i = 0; i < batchSize; i++) { for (size_t g = 0; g < groups_; g++) { - im2col(inputData + g * inputOffset, - imShape, - colData, - colShape, - strideH(), - strideW(), - paddingH(), - paddingW()); - + if (needIm2col) { + im2col(inputData + g * inputOffset, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW()); + } else { + colData = inputData + g * inputOffset; + } int M = outputChannels / groups_; int N = outputHeight * outputWidth; int K = inputChannels / groups_ * filterHeight * filterWidth; @@ -159,19 +169,27 @@ public: real* outputGrad = inputs[0].data(); real* filterData = inputs[1].data(); real* inputGrad = outputs[0].data(); + bool needIm2col = isNeedIm2col(filter); + TensorShape imShape = TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - TensorShape colShape = 
TensorShape({inputChannels / groups_, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - resizeBuffer(colShape.getElements()); - real* colData = reinterpret_cast(memory_->getBuf()); + TensorShape colShape; + real* colData = NULL; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + resizeBuffer(colShape.getElements()); + colData = reinterpret_cast(memory_->getBuf()); + } Col2ImFunctor col2im; GemmFunctor gemm; + size_t inputOffset = imShape.getElements(); size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth; @@ -182,6 +200,11 @@ public: int K = outputChannels / groups_; int N = outputHeight * outputWidth; int M = inputChannels / groups_ * filterHeight * filterWidth; + real scale = 0.0f; + if (!needIm2col) { + colData = inputGrad + g * inputOffset; + scale = 1.0f; + } gemm(CblasTrans, CblasNoTrans, M, @@ -192,17 +215,19 @@ public: M, outputGrad + g * outputOffset, N, - 0.0f, + scale, colData, N); - col2im(inputGrad + g * inputOffset, - imShape, - colData, - colShape, - strideH(), - strideW(), - paddingH(), - paddingW()); + if (needIm2col) { + col2im(inputGrad + g * inputOffset, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW()); + } } inputGrad += inputChannels * inputHeight * inputWidth; outputGrad += outputChannels * outputHeight * outputWidth; @@ -255,16 +280,23 @@ public: real* outputGrad = inputs[0].data(); real* inputData = inputs[1].data(); real* filterGrad = outputs[0].data(); + bool needIm2col = isNeedIm2col(filter); + TensorShape imShape = TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - TensorShape colShape = TensorShape({inputChannels / groups_, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - resizeBuffer(colShape.getElements()); - real* colData = reinterpret_cast(memory_->getBuf()); + TensorShape colShape; + real* colData = NULL; + + if (needIm2col) { + 
colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + resizeBuffer(colShape.getElements()); + colData = reinterpret_cast(memory_->getBuf()); + } Im2ColFunctor im2col; GemmFunctor gemm; @@ -274,15 +306,18 @@ public: size_t filterOffset = filter.getElements() / groups_; for (size_t i = 0; i < batchSize; i++) { for (size_t g = 0; g < groups_; g++) { - im2col(inputData + g * inputOffset, - imShape, - colData, - colShape, - strideH(), - strideW(), - paddingH(), - paddingW()); - + if (needIm2col) { + im2col(inputData + g * inputOffset, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW()); + } else { + colData = inputData + g * inputOffset; + } int M = outputChannels / groups_; int K = outputHeight * outputWidth; int N = inputChannels / groups_ * filterHeight * filterWidth; diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu index 15ba854009636d027447d104071163100d5e3f4b..bd98610498b1af003574129118be4684d38e5813 100644 --- a/paddle/function/Im2ColOpGpu.cu +++ b/paddle/function/Im2ColOpGpu.cu @@ -17,16 +17,21 @@ limitations under the License. 
*/ namespace paddle { -template -__global__ -void im2col(const T* data_im, int numOuts, int height, int width, - int blockH, int blockW, - int strideH, int strideW, - int paddingH, int paddingW, - int height_col, int width_col, - T* data_col) { - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; +template +__global__ void im2col(const T* data_im, + int numOuts, + int height, + int width, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int height_col, + int width_col, + T* data_col) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < numOuts) { int w_out = index % width_col; index /= width_col; @@ -39,17 +44,17 @@ void im2col(const T* data_im, int numOuts, int height, int width, data_col += (channel_out * height_col + h_out) * width_col + w_out; for (int i = 0; i < blockH; ++i) { for (int j = 0; j < blockW; ++j) { - int rIdx = int(h_in+i); - int cIdx = int(w_in+j); - if ((rIdx-(int)paddingH) >= (int)height || - (rIdx-(int)paddingH) < 0 || - (cIdx-(int)paddingW) >= (int)width || - (cIdx-(int)paddingW) < 0) { + int rIdx = int(h_in + i); + int cIdx = int(w_in + j); + if ((rIdx - (int)paddingH) >= (int)height || + (rIdx - (int)paddingH) < 0 || + (cIdx - (int)paddingW) >= (int)width || + (cIdx - (int)paddingW) < 0) { *data_col = 0; } else { - rIdx = rIdx + channel_in*height - paddingH; + rIdx = rIdx + channel_in * height - paddingH; cIdx = cIdx - paddingW; - *data_col = data_im[rIdx* width + cIdx]; + *data_col = data_im[rIdx * width + cIdx]; } data_col += height_col * width_col; } @@ -82,60 +87,73 @@ public: int outputWidth = colShape[4]; int numKernels = inputChannels * outputHeight * outputWidth; - int blocks = (numKernels + 1024 -1) / 1024; + int blocks = (numKernels + 1024 - 1) / 1024; int blockX = 512; int blockY = (blocks + 512 - 1) / 512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); - im2col<<< grid, threads, 0, STREAM_DEFAULT >>> - (imData, 
numKernels, inputHeight, inputWidth, filterHeight, filterWidth, - strideHeight, strideWidth, paddingHeight, paddingWidth, - outputHeight, outputWidth, colData); + im2col<<>>(imData, + numKernels, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth, + colData); CHECK_SYNC("Im2ColFunctor GPU failed"); } }; -template -__global__ -void col2im(size_t n, const T* data_col, size_t height, - size_t width, size_t channels, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t height_col, size_t width_col, - T* data_im) { +template +__global__ void col2im(size_t n, + const T* data_col, + size_t height, + size_t width, + size_t channels, + size_t blockH, + size_t blockW, + size_t strideH, + size_t strideW, + size_t paddingH, + size_t paddingW, + size_t height_col, + size_t width_col, + T* data_im) { size_t index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < n) { T val = 0; int w = int(index % width); int h = int((index / width) % height); int c = int(index / (width * height)); if ((w - (int)paddingW) >= 0 && - (w - (int)paddingW) < (width-2 * paddingW) && - (h - (int)paddingH) >= 0 && - (h - paddingH) < (height - 2 * paddingH)) { + (w - (int)paddingW) < (width - 2 * paddingW) && + (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) { // compute the start and end of the output int w_col_start = - (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; - int w_col_end = - min((int)(w / (int)strideW + 1), (int)(width_col)); + (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; + int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col)); int h_col_start = - (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1; + (h < (int)blockH) ? 
0 : (h - (int)blockH) / (int)strideH + 1; int h_col_end = min(int(h / strideH + 1), int(height_col)); for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { // the col location: [c * width * height + h_out, w_out] - int c_col = int(c * blockH* blockW) + \ - (h - h_col * (int)strideH) * (int)blockW + - (w - w_col * (int)strideW); + int c_col = int(c * blockH * blockW) + + (h - h_col * (int)strideH) * (int)blockW + + (w - w_col * (int)strideW); val += data_col[(c_col * height_col + h_col) * width_col + w_col]; } } h -= paddingH; w -= paddingW; - data_im[c*((width-2*paddingW) * (height-2*paddingH)) + - h*(width-2*paddingW) + w] += val; + data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) + + h * (width - 2 * paddingW) + w] += val; } } } @@ -164,32 +182,32 @@ public: int outputHeight = colShape[3]; int outputWidth = colShape[4]; - size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight) - * (inputWidth + 2*paddingWidth); + size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) * + (inputWidth + 2 * paddingWidth); - size_t blocks = (numKernels + 1024 -1) / 1024; + size_t blocks = (numKernels + 1024 - 1) / 1024; size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; + size_t blockY = (blocks + 512 - 1) / 512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); // To avoid involving atomic operations, we will launch one kernel per // bottom dimension, and then in the kernel add up the top dimensions. 
- col2im<<< grid, threads, 0, STREAM_DEFAULT >>> - (numKernels, - colData, - inputHeight + 2*paddingHeight, - inputWidth + 2*paddingWidth, - inputChannels, - filterHeight, - filterWidth, - strideHeight, - strideWidth, - paddingHeight, - paddingWidth, - outputHeight, - outputWidth, - imData); + col2im<<>>( + numKernels, + colData, + inputHeight + 2 * paddingHeight, + inputWidth + 2 * paddingWidth, + inputChannels, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth, + imData); CHECK_SYNC("Col2ImFunctor GPU failed"); } }; @@ -199,31 +217,35 @@ template class Im2ColFunctor; template class Col2ImFunctor; template class Col2ImFunctor; -template -__global__ -void im2colOCF(const T* imData, T* colData, - int inputChannels, - int inputHeight, int inputWidth, - int filterHeight, int filterWidth, - int strideHeight, int strideWidth, - int paddingHeight, int paddingWidth, - int outputHeight, int outputWidth) { +template +__global__ void im2colOCF(const T* imData, + T* colData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth) { int swId = blockIdx.x; int shId = blockIdx.y; - for (int channelId = threadIdx.z; - channelId < inputChannels; + for (int channelId = threadIdx.z; channelId < inputChannels; channelId += blockDim.z) { for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { int widthOffset = idx + swId * strideWidth - paddingWidth; int heightOffset = idy + shId * strideHeight - paddingHeight; - int imOffset = widthOffset + heightOffset * inputWidth - + channelId * inputHeight * inputWidth; + int imOffset = widthOffset + heightOffset * inputWidth + + channelId * inputHeight * inputWidth; - int colOffset = idx + idy * filterWidth - + channelId * filterHeight 
* filterWidth - + (shId * outputWidth + swId) - * (inputChannels * filterHeight * filterWidth); + int colOffset = idx + idy * filterWidth + + channelId * filterHeight * filterWidth + + (shId * outputWidth + swId) * + (inputChannels * filterHeight * filterWidth); if (heightOffset >= inputHeight || heightOffset < 0 || widthOffset >= inputWidth || widthOffset < 0) { @@ -279,39 +301,52 @@ public: int blockDimZ = 1024 / blockDimX / blockDimY; dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels)); dim3 grid(outputWidth, outputHeight); - im2colOCF<<< grid, threads, 0, STREAM_DEFAULT >>> - (imData, colData, inputChannels, inputHeight, inputWidth, - filterHeight, filterWidth, strideHeight, strideWidth, - paddingHeight, paddingWidth, outputHeight, outputWidth); + im2colOCF<<>>(imData, + colData, + inputChannels, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth); CHECK_SYNC("Im2ColFunctor GPU failed"); } }; -template -__global__ -void col2imOCF(T* imData, const T* colData, - int inputChannels, - int inputHeight, int inputWidth, - int filterHeight, int filterWidth, - int strideHeight, int strideWidth, - int paddingHeight, int paddingWidth, - int outputHeight, int outputWidth) { +template +__global__ void col2imOCF(T* imData, + const T* colData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth) { int swId = blockIdx.x; int shId = blockIdx.y; - for (int channelId = threadIdx.z; - channelId < inputChannels; + for (int channelId = threadIdx.z; channelId < inputChannels; channelId += blockDim.z) { for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { int widthOffset = idx + swId * strideWidth - paddingWidth; int 
heightOffset = idy + shId * strideHeight - paddingHeight; - int imOffset = widthOffset + heightOffset * inputWidth - + channelId * inputHeight * inputWidth; + int imOffset = widthOffset + heightOffset * inputWidth + + channelId * inputHeight * inputWidth; - int colOffset = idx + idy * filterWidth - + channelId * filterHeight * filterWidth - + (shId * outputWidth + swId) - * (inputChannels * filterHeight * filterWidth); + int colOffset = idx + idy * filterWidth + + channelId * filterHeight * filterWidth + + (shId * outputWidth + swId) * + (inputChannels * filterHeight * filterWidth); if (heightOffset >= 0 && heightOffset < inputHeight && widthOffset >= 0 && widthOffset < inputWidth) { @@ -365,10 +400,19 @@ public: int blockDimZ = 1024 / blockDimX / blockDimY; dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels)); dim3 grid(outputWidth, outputHeight); - col2imOCF<<< grid, threads, 0, STREAM_DEFAULT >>> - (imData, colData, inputChannels, inputHeight, inputWidth, - filterHeight, filterWidth, strideHeight, strideWidth, - paddingHeight, paddingWidth, outputHeight, outputWidth); + col2imOCF<<>>(imData, + colData, + inputChannels, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth); CHECK_SYNC("Col2ImFunctor GPU failed"); } }; diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu index dcfcb2325d7dae22e0e0e78fc0bddf061fc0940c..9449b89056b4b1740cb4c3de630348b1b361d61e 100644 --- a/paddle/function/MulOpGpu.cu +++ b/paddle/function/MulOpGpu.cu @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "hl_base.h" #include "MulOp.h" +#include "hl_base.h" #include "paddle/math/Matrix.h" #include "paddle/math/SparseMatrix.h" diff --git a/paddle/function/PadOpGpu.cu b/paddle/function/PadOpGpu.cu index 9094f1528433fdcaad3397a991aa8ac6fa04bc01..5b6f4e6832aea4bcfe22e530f5f25ef5815729f1 100644 --- a/paddle/function/PadOpGpu.cu +++ b/paddle/function/PadOpGpu.cu @@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "PadOp.h" +#include "hl_base.h" namespace paddle { -__global__ void KePad(real* outputs, const real* inputs, - int inC, int inH, int inW, - int padc, int padh, int padw, - int outC, int outH, int outW, int nthreads) { +__global__ void KePad(real* outputs, + const real* inputs, + int inC, + int inH, + int inW, + int padc, + int padh, + int padw, + int outC, + int outH, + int outW, + int nthreads) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < nthreads) { const int w = idx % inW; @@ -50,16 +58,33 @@ void Pad(real* outputs, int outC = inC + cstart + cend; int outH = inH + hstart + hend; int outW = inW + wstart + wend; - KePad<<>> - (outputs, inputs, inC, inH, inW, cstart, hstart, wstart, - outC, outH, outW, nth); + KePad<<>>(outputs, + inputs, + inC, + inH, + inW, + cstart, + hstart, + wstart, + outC, + outH, + outW, + nth); CHECK_SYNC("Pad"); } -__global__ void KePadDiff(real* inGrad, const real* outGrad, - int inC, int inH, int inW, - int padc, int padh, int padw, - int outC, int outH, int outW, int nthreads) { +__global__ void KePadDiff(real* inGrad, + const real* outGrad, + int inC, + int inH, + int inW, + int padc, + int padh, + int padw, + int outC, + int outH, + int outW, + int nthreads) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < nthreads) { const int w = idx % inW; @@ -89,9 +114,18 @@ void PadGrad(real* inGrad, int outC = inC + 
cstart + cend; int outH = inH + hstart + hend; int outW = inW + wstart + wend; - KePadDiff <<>> - (inGrad, outGrad, inC, inH, inW, cstart, hstart, wstart, - outC, outH, outW, nth); + KePadDiff<<>>(inGrad, + outGrad, + inC, + inH, + inW, + cstart, + hstart, + wstart, + outC, + outH, + outW, + nth); CHECK_SYNC("PadGrad"); } diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu index d9dcc7d59d1e3c222f5a7ce448daa8d7edb6c978..b0cbd9fd1df9a35d6cc1cb5312099d8b45197944 100644 --- a/paddle/function/RowConvOpGpu.cu +++ b/paddle/function/RowConvOpGpu.cu @@ -12,16 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "RowConvOp.h" +#include "hl_base.h" namespace paddle { -template -__global__ void KeRowConv(real* y, const real* x, const real* w, - const int* starts, const int height, const int width, - const int numSeq, const int context) { - +template +__global__ void KeRowConv(real* y, + const real* x, + const real* w, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -30,7 +34,7 @@ __global__ void KeRowConv(real* y, const real* x, const real* w, __shared__ real sw[BLOCK_H][BLOCK_W]; for (int i = tidy; i < context; i += blky) { - sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; + sw[i][tidx] = gidx + tidx < width ? 
w[i * width + gidx + tidx] : 0.0; } __syncthreads(); @@ -56,9 +60,14 @@ __global__ void KeRowConv(real* y, const real* x, const real* w, } } -__global__ void KeRowConv2(real* y, const real* x, const real* w, - const int* starts, const int height, const int width, - const int numSeq, const int context) { +__global__ void KeRowConv2(real* y, + const real* x, + const real* w, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -84,8 +93,6 @@ __global__ void KeRowConv2(real* y, const real* x, const real* w, } } - - template <> void RowConv(GpuMatrix& out, const GpuMatrix& in, @@ -105,21 +112,24 @@ void RowConv(GpuMatrix& out, dim3 dimGrid(DIVUP(width, dimBlock.x), 1); if (contextLength <= 32) { - KeRowConv<32, 32><<>> - (y, x, w, starts, height, width, numSeq, contextLength); + KeRowConv<32, 32><<>>( + y, x, w, starts, height, width, numSeq, contextLength); } else { - KeRowConv2<<>> - (y, x, w, starts, height, width, numSeq, contextLength); + KeRowConv2<<>>( + y, x, w, starts, height, width, numSeq, contextLength); } CHECK_SYNC("RowConv"); } - -template -__global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +template +__global__ void KeRowConvBwWeight(real* dw, + const real* x, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -138,21 +148,21 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, const int start = starts[i]; const int end = starts[i + 1]; const int steps = end - start; - const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H; + const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; for (int j = tidy; 
j < size; j += BLOCK_H) { int xoff = gidx + tidx; int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? - x[yoff * width + xoff] : 0.0; - sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? - dy[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = + (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy + context - 1] = + (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0; __syncthreads(); if (tidy < (context - 1)) { yoff = yoff - context + 1; - sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? - dy[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy] = + (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0; } __syncthreads(); @@ -179,11 +189,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, } } -template -__global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +template +__global__ void KeRowConvBwWeight2(real* dw, + const real* x, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int gidx = blockIdx.x * blockDim.x; @@ -196,19 +210,21 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, const int end = starts[i + 1]; const int steps = end - start; - const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H; + const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; for (int j = tidy; j < size; j += BLOCK_H) { int xoff = gidx + tidx; int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? - x[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = + (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; __syncthreads(); for (int t = 0; t < context; t++) { - sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && - yoff - t < end) ? 
dy[(yoff - t) * width + xoff] : 0.0; + sh_dy[tidx][tidy] = + (xoff < width && (yoff - t) >= start && yoff - t < end) + ? dy[(yoff - t) * width + xoff] + : 0.0; __syncthreads(); real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx]; @@ -222,18 +238,22 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, __syncthreads(); if (tidx == 0 && (gidx + tidy) < width) { - dw[t*width + gidx + tidy] += val; + dw[t * width + gidx + tidy] += val; } } } } } -template -__global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +template +__global__ void KeRowConvBwData(real* dx, + const real* w, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -242,7 +262,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, __shared__ real sw[BLOCK_H][BLOCK_W]; for (int i = tidy; i < context; i += blky) { - sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; + sw[i][tidx] = gidx + tidx < width ? 
w[i * width + gidx + tidx] : 0.0; } __syncthreads(); @@ -266,10 +286,14 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, } } -__global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +__global__ void KeRowConvBwData2(real* dx, + const real* w, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -295,14 +319,13 @@ __global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy, } } - template <> void RowConvGrad(const GpuMatrix& outG, - const GpuMatrix& in, - const GpuMatrix& filter, - GpuMatrix& inG, - GpuMatrix& filterG, - const GpuIVector& seq) { + const GpuMatrix& in, + const GpuMatrix& filter, + GpuMatrix& inG, + GpuMatrix& filterG, + const GpuIVector& seq) { const size_t numSeq = seq.getSize() - 1; const size_t contextLength = filter.getHeight(); const size_t height = in.getHeight(); @@ -318,13 +341,11 @@ void RowConvGrad(const GpuMatrix& outG, dim3 dimGrid(DIVUP(width, dimBlock.x), 1); real* dw = filterG.getData(); if (contextLength <= 32) { - KeRowConvBwWeight<32, 32, 32> - <<>> - (dw, x, dy, starts, height, width, numSeq, contextLength); + KeRowConvBwWeight<32, 32, 32><<>>( + dw, x, dy, starts, height, width, numSeq, contextLength); } else { - KeRowConvBwWeight2<32, 32> - <<>> - (dw, x, dy, starts, height, width, numSeq, contextLength); + KeRowConvBwWeight2<32, 32><<>>( + dw, x, dy, starts, height, width, numSeq, contextLength); } } @@ -333,13 +354,11 @@ void RowConvGrad(const GpuMatrix& outG, dim3 dimBlock2(32, 32); dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1); if (contextLength <= 64) { - KeRowConvBwData<32, 64> - <<>> - (dx, w, dy, starts, height, width, numSeq, contextLength); + KeRowConvBwData<32, 64><<>>( + dx, w, dy, starts, height, width, 
numSeq, contextLength); } else { - KeRowConvBwData2 - <<>> - (dx, w, dy, starts, height, width, numSeq, contextLength); + KeRowConvBwData2<<>>( + dx, w, dy, starts, height, width, numSeq, contextLength); } } diff --git a/paddle/function/TensorShapeTest.cpp b/paddle/function/TensorShapeTest.cpp index 45a2e106e7fc3f0e9e57cf8c2bb549d747f4f49b..e5c698237706e7210d3045bbfd0088af58db2954 100644 --- a/paddle/function/TensorShapeTest.cpp +++ b/paddle/function/TensorShapeTest.cpp @@ -19,35 +19,35 @@ namespace paddle { TEST(TensorShape, Constructor) { TensorShape t1; - EXPECT_EQ(t1.ndims(), 0); - EXPECT_EQ(t1.getElements(), 0); + EXPECT_EQ(t1.ndims(), 0U); + EXPECT_EQ(t1.getElements(), 0U); TensorShape t2(3); - EXPECT_EQ(t2.ndims(), 3); - EXPECT_EQ(t2.getElements(), 1); + EXPECT_EQ(t2.ndims(), 3U); + EXPECT_EQ(t2.getElements(), 1U); TensorShape t3({8, 10}); - EXPECT_EQ(t3.ndims(), 2); - EXPECT_EQ(t3.getElements(), 80); + EXPECT_EQ(t3.ndims(), 2U); + EXPECT_EQ(t3.getElements(), 80U); TensorShape t4(t3); EXPECT_EQ(t4.ndims(), t3.ndims()); EXPECT_EQ(t4.getElements(), t3.getElements()); TensorShape t5({1, 2, 3, 4, 5}); - EXPECT_EQ(t5.ndims(), 5); - EXPECT_EQ(t5.getElements(), 120); + EXPECT_EQ(t5.ndims(), 5U); + EXPECT_EQ(t5.getElements(), 120U); } TEST(TensorShape, GetAndSet) { TensorShape t({1, 2, 3}); - EXPECT_EQ(t.ndims(), 3); - EXPECT_EQ(t.getElements(), 6); + EXPECT_EQ(t.ndims(), 3U); + EXPECT_EQ(t.getElements(), 6U); EXPECT_EQ(t[1], 2); t.setDim(1, 100); - EXPECT_EQ(t.getElements(), 300); - EXPECT_EQ(t[1], 100); + EXPECT_EQ(t.getElements(), 300U); + EXPECT_EQ(t[1], 100U); } } // namespace paddle diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/function/TensorTypeTest.cpp index e50e46f3e99111731d9587f3e4ddfd4b26ae27e9..d1c559a91e294853fa6e19f9115bc008ae56915c 100644 --- a/paddle/function/TensorTypeTest.cpp +++ b/paddle/function/TensorTypeTest.cpp @@ -19,9 +19,9 @@ namespace paddle { TEST(TensorType, Matrix) { Tensor::Matrix matrix(100, 200); - 
EXPECT_EQ(matrix.getHeight(), 100); - EXPECT_EQ(matrix.getWidth(), 200); - EXPECT_EQ(matrix.getElementCnt(), 100 * 200); + EXPECT_EQ(matrix.getHeight(), 100U); + EXPECT_EQ(matrix.getWidth(), 200U); + EXPECT_EQ(matrix.getElementCnt(), 100U * 200U); EXPECT_EQ(matrix.useGpu(), false); Tensor::Matrix testGpu(100, 200); @@ -33,15 +33,15 @@ TEST(TensorType, Vector) { Tensor::Vector gpuVector(100); EXPECT_EQ(cpuVector.useGpu(), false); EXPECT_EQ(gpuVector.useGpu(), true); - EXPECT_EQ(cpuVector.getSize(), 100); - EXPECT_EQ(gpuVector.getSize(), 100); + EXPECT_EQ(cpuVector.getSize(), 100U); + EXPECT_EQ(gpuVector.getSize(), 100U); Tensor::Vector cpuIVector(100); Tensor::Vector gpuIVector(100); EXPECT_EQ(cpuIVector.useGpu(), false); EXPECT_EQ(gpuIVector.useGpu(), true); - EXPECT_EQ(cpuIVector.getSize(), 100); - EXPECT_EQ(gpuIVector.getSize(), 100); + EXPECT_EQ(cpuIVector.getSize(), 100U); + EXPECT_EQ(gpuIVector.getSize(), 100U); } TEST(TensorType, EmptyMatrix) { diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp index f0ec77a5d00333993427fb8d0bc938c884e50c95..00d048eb216baf37c875c870a31cfd55a97f2974 100644 --- a/paddle/function/nnpack/NNPACKConvOp.cpp +++ b/paddle/function/nnpack/NNPACKConvOp.cpp @@ -49,9 +49,7 @@ class NNPACKConvFunction : public ConvFunctionBase { public: void init(const FuncConfig& config) override { ConvFunctionBase::init(config); - CHECK_EQ(groups_, (size_t)1); algorithm_ = get_nnp_convolution_algorithm(config.get("algo")); - // algorithm_ = nnp_convolution_algorithm_auto; transform_strategy_ = nnp_convolution_transform_strategy_compute; nnp_status status = nnp_initialize(); CHECK_EQ(status, nnp_status_success); @@ -67,8 +65,7 @@ public: } } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& input = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& 
output = outputs[0].shape(); @@ -91,8 +88,8 @@ public: size_t filterHeight = getFilterHeight(filter); size_t filterWidth = getFilterWidth(filter); size_t outputChannels = output[1]; - // size_t outputHeight = output[2]; - // size_t outputWidth = output[3]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; nnp_size inputSize = {.width = inputWidth, .height = inputHeight}; nnp_padding padding = {.top = (size_t)paddingH(), @@ -171,49 +168,58 @@ public: } } + size_t inputOffset = inputChannels / groups_ * inputHeight * inputWidth; + size_t outputOffset = outputChannels / groups_ * outputHeight * outputWidth; + size_t filterOffset = filter.getElements() / groups_; + if (batchSize == 1) { - nnp_status status = - nnp_convolution_inference(algorithm_, - transform_strategy_, - inputChannels, - outputChannels, - inputSize, - padding, - kernelSize, - outputSubsampling, - inputData, - filterData, - nullptr, /* bias */ - outputData, - bufferPtr, - sizePtr, - nnp_activation_identity, - nullptr, - threadpool_, /* threadpool */ - nullptr); - CHECK_EQ(status, nnp_status_success); + for (size_t g = 0; g < groups_; g++) { + nnp_status status = + nnp_convolution_inference(algorithm_, + transform_strategy_, + inputChannels / groups_, + outputChannels / groups_, + inputSize, + padding, + kernelSize, + outputSubsampling, + inputData + inputOffset * g, + filterData + filterOffset * g, + nullptr, /* bias */ + outputData + outputOffset * g, + bufferPtr, + sizePtr, + nnp_activation_identity, + nullptr, + threadpool_, /* threadpool */ + nullptr); + CHECK_EQ(status, nnp_status_success); + } } else { - // only supports stride = 1 - CHECK_EQ(strideH(), 1); - CHECK_EQ(strideW(), 1); - nnp_status status = nnp_convolution_output(algorithm_, - batchSize, - inputChannels, - outputChannels, - inputSize, - padding, - kernelSize, - inputData, - filterData, - nullptr, /* bias */ - outputData, - bufferPtr, - sizePtr, - nnp_activation_identity, - nullptr, - threadpool_, /* threadpool */ 
- nullptr); - CHECK_EQ(status, nnp_status_success); + for (size_t g = 0; g < groups_; g++) { + // only supports stride = 1 + CHECK_EQ(strideH(), 1); + CHECK_EQ(strideW(), 1); + nnp_status status = + nnp_convolution_output(algorithm_, + batchSize, + inputChannels / groups_, + outputChannels / groups_, + inputSize, + padding, + kernelSize, + inputData + inputOffset * g, + filterData + filterOffset * g, + nullptr, /* bias */ + outputData + outputOffset * g, + bufferPtr, + sizePtr, + nnp_activation_identity, + nullptr, + threadpool_, /* threadpool */ + nullptr); + CHECK_EQ(status, nnp_status_success); + } } } diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp index 81cc3c890b6d4ad048e4edc03208c85778244078..5de2170877ed6f6c70c5617918ad2c4e3b3ed2ee 100644 --- a/paddle/gserver/activations/ActivationFunction.cpp +++ b/paddle/gserver/activations/ActivationFunction.cpp @@ -186,7 +186,10 @@ Error __must_check forward(Argument& act) { useGpu(act.deviceId)); } - auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId)); + auto starts = + act.hasSubseq() + ? act.subSequenceStartPositions->getVector(useGpu(act.deviceId)) + : act.sequenceStartPositions->getVector(useGpu(act.deviceId)); act.value->sequenceSoftmax(*act.value, *starts); return Error(); } @@ -197,8 +200,9 @@ Error __must_check backward(Argument& act) { "Input width for each timestep of sequence softmax should be 1"); } - size_t numSequences = act.getNumSequences(); - const int* starts = act.sequenceStartPositions->getData(false); + size_t numSequences = + act.hasSubseq() ? 
act.getNumSubSequences() : act.getNumSequences(); + const int* starts = act.getCpuStartPositions(); for (size_t i = 0; i < numSequences; ++i) { // TODO(Dangqingqing) optimization for GPU diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 9ddd449de7500f5682d59469328f06971c6e83bf..f98bf95064fa539b990309dfe0bff10c1e99d096 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -967,8 +967,9 @@ void RecurrentGradientMachine::generateSequence() { size_t numSequences = getGenBatchSize(); resizeBootFrame(numSequences); - // We create only two sub-network in generation for alternate use. - // Thus, we can reduce total memory of output_ in layer forward. + // We create only two sub-network in generation, one stores states of all + // layers in previous time step and the other storing the states at current + // time step. 
resizeOrCreateFrames(2); // outFrameLines_.size() > 1UL @@ -1001,10 +1002,9 @@ void RecurrentGradientMachine::generateSequence() { // init outArg size_t resultNum = generator_.config.num_results_per_sample(); - IVector::resizeOrCreate( - generator_.outArg.ids, - generator_.config.max_num_frames() * numSequences * resultNum, - false); + size_t maxGenWordCount = + generator_.config.max_num_frames() * numSequences * resultNum; + IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false); if (resultNum > 1) { CHECK_LE(resultNum, static_cast(generator_.config.beam_size())); Matrix::resizeOrCreate(generator_.outArg.in, @@ -1012,6 +1012,11 @@ void RecurrentGradientMachine::generateSequence() { /* width */ resultNum, false, /* useGpu */ false); + Matrix::resizeOrCreate(generator_.outArg.value, + /* height */ maxGenWordCount, + /* width */ 1, + false, + /* useGpu */ false); } ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions, numSequences + 1, @@ -1313,13 +1318,20 @@ void RecurrentGradientMachine::fillGenOutputs() { starts[0] = 0; if (numResults > 1) { real* probs = generator_.outArg.in->getData(); + real* idsProb = generator_.outArg.value->getData(); + size_t curPos = 0; for (size_t i = 0; i < finalPaths_.size(); ++i) { for (size_t j = 0; j < finalPaths_[i].size(); ++j) { Path& path = finalPaths_[i][j]; - generator_.ids.push_back(path.ids.size()); // sequence size + size_t genLen = path.ids.size(); + generator_.ids.push_back(genLen); // sequence size generator_.ids.insert( generator_.ids.end(), path.ids.begin(), path.ids.end()); generator_.ids.push_back(-1); // end of sequence + + memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen); + curPos += genLen; + idsProb[curPos++] = -1.0; probs[i * numResults + j] = path.logProb; if (!j && dataArgsSize_) { diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h index 
f245620cf668bb341df99cf498105cbd996a6b24..fb3fc5877ac96323e891f800db80af83b6809831 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h @@ -189,6 +189,11 @@ public: */ std::vector ids; + /** + * @brief idsProb, log probability of each generated words. + */ + std::vector idsProb; + /** * @brief logProb, current probability of path. */ @@ -228,11 +233,13 @@ public: */ Path(Path& old, int newId, real logProb, int machineId, int topIndex) : ids(old.ids), + idsProb(old.idsProb), logProb(old.logProb + logProb), machineId(machineId), topIndex(topIndex), seqId(old.seqId) { ids.push_back(newId); + idsProb.push_back(logProb); if (!old.probHistory.empty()) { this->probHistory = old.probHistory; // probHistory store current prob, not sum @@ -411,8 +418,9 @@ protected: struct Generator { GeneratorConfig config; - std::vector ids; // store generated sequences - Argument outArg; // final output argument + std::vector ids; // store generated sequences + std::vector idsProb; // log probability of each generated word + Argument outArg; // final output argument }; bool generating_; Generator generator_; diff --git a/paddle/gserver/layers/ClipLayer.cpp b/paddle/gserver/layers/ClipLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..13f16c953793b82183237188b56eb61d76ecd2fd --- /dev/null +++ b/paddle/gserver/layers/ClipLayer.cpp @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" + +namespace paddle { + +/** + * A layer for clipping the input value by the threshold. + * \f[ + * out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right) + * \f] + */ + +class ClipLayer : public Layer { +protected: + double min_; + double max_; + +public: + explicit ClipLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(clip, ClipLayer); + +bool ClipLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 1U); + auto layerConf = config_.inputs(0).clip_conf(); + min_ = layerConf.min(); + max_ = layerConf.max(); + CHECK_LT(min_, max_); + return true; +} + +void ClipLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV = getInputValue(0); + resetOutput(inV->getHeight(), inV->getWidth()); + MatrixPtr outV = getOutputValue(); + outV->copyFrom(*inV); + outV->clip(min_, max_); +} + +void ClipLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV = getInputValue(0); + MatrixPtr inG = getInputGrad(0); + if (inG) { + MatrixPtr outV = getOutputValue(); + MatrixPtr outG = getOutputGrad(); + MatrixPtr tmpMtx; + Matrix::resizeOrCreate( + tmpMtx, outG->getHeight(), outG->getWidth(), false, useGpu_); + tmpMtx->clipDerivative(*inV, min_, max_); + inG->addDotMul(*outG, *tmpMtx, 1, 1); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp index 09dac05a7ad7a80bd6b9e12e8f7f060310d516c8..44ba2c4b7d1562d2ce839b5f4b4de1af35e6925f 100644 --- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp +++ 
b/paddle/gserver/layers/CudnnBatchNormLayer.cpp @@ -14,6 +14,7 @@ limitations under the License. */ #include "CudnnBatchNormLayer.h" #include "Layer.h" +#include "paddle/cuda/include/hl_batch_norm.h" #include "paddle/utils/Stat.h" namespace paddle { @@ -79,16 +80,33 @@ void CudnnBatchNormLayer::forward(PassType passType) { savedInvVar); } else { // used movingMean and movingVar in testing - hl_batch_norm_forward_inference(ioDesc_, - input, - ioDesc_, - output, - bnParamDesc_, - gamma, - beta, - movingMean, - movingVar, - EPS); + if (batchSize <= 1024) { + hl_batch_norm_forward_inference(ioDesc_, + input, + ioDesc_, + output, + bnParamDesc_, + gamma, + beta, + movingMean, + movingVar, + EPS); + } else { + // There is a limitation in cudnn library. + // When the batch size is larger than 1024 in cuDNN v5.1, + // the cudnnBatchNormalizationForwardInference will fail. + hl_batch_norm_cuda_inference(input, + output, + gamma, + beta, + movingMean, + movingVar, + EPS, + batchSize, + channels_, + imageH_, + imageW_); + } } /* activation */ { diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp index 783e02e47cb91e28eb88b079f1e94439d34fa775..0ece2799318ea5ecc91f97f71289d4d07246dcaa 100644 --- a/paddle/gserver/layers/ExpandConvLayer.cpp +++ b/paddle/gserver/layers/ExpandConvLayer.cpp @@ -57,8 +57,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, convGradFilterType = "GemmConvGradFilter"; } - if (FLAGS_use_nnpack) { - CHECK_EQ(isDeconv_, false); + if (FLAGS_use_nnpack && !isDeconv_) { createFunction(forward_, "NNPACKConv", FuncConfig() diff --git a/paddle/gserver/layers/GruCompute.cu b/paddle/gserver/layers/GruCompute.cu index d5e547dce347c824f959425551afea66dfd94e5a..b4f5c54b14767586cb7b7e2c86cc069e2063ccfd 100644 --- a/paddle/gserver/layers/GruCompute.cu +++ b/paddle/gserver/layers/GruCompute.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - #include "GruCompute.h" #include "hl_recurrent_apply.cuh" @@ -31,8 +30,10 @@ void GruCompute::forward<1>(hl_gru_value value, int frameSize, int batchSize) { } template <> -void GruCompute::backward<1>(hl_gru_value value, hl_gru_grad grad, - int frameSize, int batchSize) { +void GruCompute::backward<1>(hl_gru_value value, + hl_gru_grad grad, + int frameSize, + int batchSize) { hl_gpu_gru_backward(hppl::backward::gru_stateGrad(), hppl::backward::gru_resetGrad(), value, diff --git a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8ce591d4762466e1ed4b2970cb9cae9203bc0a2b --- /dev/null +++ b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp @@ -0,0 +1,117 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "Layer.h" + +namespace paddle { + +class KmaxSeqScoreLayer : public Layer { +private: + MatrixPtr scores_; + size_t beamSize_; + void kmaxScorePerSeq(const real* score, + real* sortedRes, + const ICpuGpuVectorPtr seqStartPos); + +public: + explicit KmaxSeqScoreLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(kmax_seq_score, KmaxSeqScoreLayer); + +bool KmaxSeqScoreLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + bool ret = Layer::init(layerMap, parameterMap); + CHECK_EQ(1U, inputLayers_.size()); + + beamSize_ = config_.beam_size(); + CHECK_GE(beamSize_, 1U); + + setNeedSequenceInfo(false); + setNeedGradient(false); + return ret; +} + +void KmaxSeqScoreLayer::kmaxScorePerSeq(const real* scores, + real* sortedIds, + const ICpuGpuVectorPtr seqStartPos) { + int* starts = seqStartPos->getMutableData(false); + std::vector indices; + for (size_t i = 0; i < seqStartPos->getSize() - 1; ++i) { + int seqLen = starts[i + 1] - starts[i]; + int k = std::min(static_cast(beamSize_), seqLen); + + indices.resize(seqLen, 0); + std::iota(begin(indices), end(indices), 0.); + std::vector tmpScore(scores + starts[i], scores + starts[i + 1]); + std::partial_sort( + begin(indices), + begin(indices) + k, + end(indices), + [&](size_t a, size_t b) { return tmpScore[a] > tmpScore[b]; }); + memcpy(sortedIds + (i * beamSize_), indices.data(), k * sizeof(real)); + } +} + +void KmaxSeqScoreLayer::forward(PassType passType) { + Layer::forward(passType); + + const Argument& input = getInput(0); + const MatrixPtr inputScore = getInputValue(0); + + CHECK(input.hasSeq() || input.hasSubseq()) + << "input of " << getName() + << " must be a sequence or a nested sequence."; + CHECK_EQ(input.value->getWidth(), 1UL) + << "input of " << getName() 
+ << " is score over a sequence or a nested sequence, so its width " + << " must be 1."; + + if (useGpu_) { + // this Layer runs only in CPU, if the model is running on GPU, + // then copy the input to this layer from GPU to CPU. + Matrix::resizeOrCreate(scores_, + inputScore->getHeight(), + 1, + false /* trans */, + false /* useGpu */); + scores_->copyFrom(*inputScore); + } else { + scores_ = inputScore; + } + + Matrix::resizeOrCreate( + output_.value, + input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(), + beamSize_, + false, + false); + output_.value->one(); + output_.value->mulScalar(-1.); + + kmaxScorePerSeq(scores_->getData(), + output_.value->getData(), + input.hasSubseq() ? input.subSequenceStartPositions + : input.sequenceStartPositions); +} + +void KmaxSeqScoreLayer::backward(const UpdateCallback& callback) {} + +} // namespace paddle diff --git a/paddle/gserver/layers/LstmCompute.cu b/paddle/gserver/layers/LstmCompute.cu index f75c0c40ccc833e35f8fe8f21c12b3d3f68d5eb6..d3f59b52a4b3163f47a969d9a08ecd139a099e33 100644 --- a/paddle/gserver/layers/LstmCompute.cu +++ b/paddle/gserver/layers/LstmCompute.cu @@ -12,41 +12,62 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "LstmCompute.h" #include "hl_recurrent_apply.cuh" namespace paddle { template <> -void LstmCompute::forwardBatch<1>(hl_lstm_value value, int frameSize, - int batchSize) { - hl_gpu_lstm_forward(hppl::forward::lstm(), value, frameSize, - batchSize, activeNode_, activeGate_, +void LstmCompute::forwardBatch<1>(hl_lstm_value value, + int frameSize, + int batchSize) { + hl_gpu_lstm_forward(hppl::forward::lstm(), + value, + frameSize, + batchSize, + activeNode_, + activeGate_, activeState_); } template <> -void LstmCompute::backwardBatch<1>(hl_lstm_value value, hl_lstm_grad grad, - int frameSize, int batchSize) { - hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad, - frameSize, batchSize, activeNode_, - activeGate_, activeState_); +void LstmCompute::backwardBatch<1>(hl_lstm_value value, + hl_lstm_grad grad, + int frameSize, + int batchSize) { + hl_gpu_lstm_backward(hppl::backward::lstm(), + value, + grad, + frameSize, + batchSize, + activeNode_, + activeGate_, + activeState_); } template <> void LstmCompute::forwardOneSequence<1>(hl_lstm_value value, int frameSize) { - hl_gpu_lstm_forward(hppl::forward::lstm(), value, - frameSize, /* batchSize */ 1, - activeNode_, activeGate_, activeState_); + hl_gpu_lstm_forward(hppl::forward::lstm(), + value, + frameSize, + /* batchSize */ 1, + activeNode_, + activeGate_, + activeState_); } template <> -void LstmCompute::backwardOneSequence<1>(hl_lstm_value value, hl_lstm_grad grad, +void LstmCompute::backwardOneSequence<1>(hl_lstm_value value, + hl_lstm_grad grad, int frameSize) { - hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad, - frameSize, /* batchSize */ 1, - activeNode_, activeGate_, activeState_); + hl_gpu_lstm_backward(hppl::backward::lstm(), + value, + grad, + frameSize, + /* batchSize */ 1, + activeNode_, + activeGate_, + activeState_); } } // namespace paddle diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp index 
a97fa6bf78fce27a4e0cf329bf3309ba4a439965..0a1e17b9aa57b373f0df6e079341729539f4e193 100644 --- a/paddle/gserver/layers/PrintLayer.cpp +++ b/paddle/gserver/layers/PrintLayer.cpp @@ -29,7 +29,7 @@ public: vals.push_back(s.str()); } size_t pos = 0; - int i = 0; + size_t i = 0; std::ostringstream s; const std::string& format = config_.user_arg(); while (true) { diff --git a/paddle/gserver/layers/RowL2NormLayer.cpp b/paddle/gserver/layers/RowL2NormLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0d609be43b73a86d0d0f7b60be993836e2ea6fff --- /dev/null +++ b/paddle/gserver/layers/RowL2NormLayer.cpp @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" + +namespace paddle { + +/** + * A layer for L2 normalization in each row, + * \f[ + * out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}} + * \f] + * where the size of \f$in\f$ is (batchSize x dataDim), + * and the size of \f$out\f$ is (batchSize x dataDim). 
+ */ + +class RowL2NormLayer : public Layer { +protected: + MatrixPtr inSquare_; + MatrixPtr l2NormReciprocal_; + MatrixPtr dotSum_; + +public: + explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(row_l2_norm, RowL2NormLayer); + +bool RowL2NormLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 1U); + + return true; +} + +void RowL2NormLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV = getInputValue(0); + + /* malloc memory for the output_ if necessary */ + size_t batchSize = inV->getHeight(); + size_t dataDim = getSize(); + CHECK_EQ(dataDim, inV->getWidth()); + resetOutput(batchSize, dataDim); + MatrixPtr outV = getOutputValue(); + + Matrix::resizeOrCreate(inSquare_, batchSize, dataDim, false, useGpu_); + inV->square2(*inSquare_); + Matrix::resizeOrCreate(l2NormReciprocal_, batchSize, 1, false, useGpu_); + inSquare_->rowSum(*l2NormReciprocal_); + l2NormReciprocal_->sqrt2(*l2NormReciprocal_); + l2NormReciprocal_->scalarDiv(*l2NormReciprocal_, 1.0); + outV->rowScale(0, *inV, *l2NormReciprocal_); +} + +void RowL2NormLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV = getInputValue(0); + MatrixPtr inG = getInputGrad(0); + MatrixPtr outV = getOutputValue(); + MatrixPtr outG = getOutputGrad(); + size_t batchSize = inV->getHeight(); + + // inG[ij] += outG[ij] * l2NormReciprocal + // inG[ij] += -inV[ij] * l2NormReciprocal * l2NormReciprocal * DotMul(outG[i], + // outV[i]) + if (inG) { + Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_); + dotSum_->zeroMem(); + dotSum_->rowDotMul(0, *outG, *outV); + dotSum_->dotMul(*dotSum_, *l2NormReciprocal_); + dotSum_->dotMul(*dotSum_, 
*l2NormReciprocal_); + inSquare_->rowScale(0, *inV, *dotSum_); + inG->sub(*inSquare_); + inG->addRowScale(0, *outG, *l2NormReciprocal_); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/gserver/layers/SubNestedSequenceLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..76f587fff760d9eb9c2a8eeed53abf4d42e90834 --- /dev/null +++ b/paddle/gserver/layers/SubNestedSequenceLayer.cpp @@ -0,0 +1,176 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/math/Matrix.h" +#include "paddle/math/Vector.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +class SubNestedSequenceLayer : public Layer { +public: + explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + +private: + /* + * This functions generates the indices of rows in a batch according to the + * indices of selected sub-sequence in each sequence. 
+ * + * Examples: + * selectedIndices: + * [ + * [0, 1, -1], + * [0, 1, 2], + * [0, -1, -1], + * [0, 2, 3], + * ] + * inputSeqInfo: + * [ + * [0,3,4], + * [4,5,7,10,15], + * [15,20], + * [20,22,23,25,28] + * ] + * + * the output is saved to private member rowIndice_; + * [0,1,2,3,4,5,6,7,8,9,15,16,17,18,19,20, + * 21,23,24,25,26,27] + */ + + void calSelectedCols(const MatrixPtr selectedIndices, + const std::vector>& inputSeqInfo); + + // if the second input of this layer is on GPU memory, copy it to CPU memory. + MatrixPtr selIdsCpu_; + + // reorganized sequenceStartPositions and subSequenceStartPositions + // into a 2d vector to facilitate the sequence selection process. + std::vector> inputSeqInfoVec_; + + // the final selected row indices in a batch, + // rowIndice_ and selectedRows_ actually share the same memory. + IVectorPtr rowIndice_; + std::vector selectedRows_; +}; + +REGISTER_LAYER(sub_nested_seq, SubNestedSequenceLayer); + +bool SubNestedSequenceLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + CHECK_EQ(2U, inputLayers_.size()); + setNeedSequenceInfo(false); + return true; +} + +void SubNestedSequenceLayer::calSelectedCols( + const MatrixPtr selectedIndices, + const std::vector>& inputSeqInfo) { + selectedRows_.clear(); + + std::vector outSeqStartInfo(1, 0); + std::vector outSubSeqStartInfo(1, 0); + + size_t seqNum = selectedIndices->getHeight(); + size_t beamSize = selectedIndices->getWidth(); + for (size_t i = 0; i < seqNum; ++i) { + for (size_t j = 0; j < beamSize; ++j) { + if (selectedIndices->getElement(i, j) == -1.) 
break; + int selSubSeqIdx = selectedIndices->getElement(i, j); + CHECK_GT(inputSeqInfoVec_[i].size() - 1, selSubSeqIdx); + + size_t subSeqLen = inputSeqInfoVec_[i][selSubSeqIdx + 1] - + inputSeqInfoVec_[i][selSubSeqIdx]; + for (size_t k = 0; k < subSeqLen; ++k) + selectedRows_.push_back(inputSeqInfoVec_[i][selSubSeqIdx] + k); + outSubSeqStartInfo.push_back(outSubSeqStartInfo.back() + subSeqLen); + } + outSeqStartInfo.push_back(outSubSeqStartInfo.back()); + } + + if (useGpu_) { + rowIndice_ = IVector::create(selectedRows_.size(), useGpu_); + rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size()); + } else { + rowIndice_ = + IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_); + } + + // create the sequence information for the output. + ICpuGpuVector::resizeOrCreate( + output_.sequenceStartPositions, outSeqStartInfo.size(), false); + output_.sequenceStartPositions->copyFrom( + outSeqStartInfo.data(), outSeqStartInfo.size(), false); + + ICpuGpuVector::resizeOrCreate( + output_.subSequenceStartPositions, outSubSeqStartInfo.size(), false); + output_.subSequenceStartPositions->copyFrom( + outSubSeqStartInfo.data(), outSubSeqStartInfo.size(), false); +} + +void SubNestedSequenceLayer::forward(PassType passType) { + Layer::forward(passType); + + const Argument& inputSeq = getInput(0); + CHECK(inputSeq.hasSubseq()) << "The first input of SubNestSequence layer " + << "must be a nested sequence."; + const MatrixPtr selectedIndices = getInputValue(1); + CHECK_EQ(inputSeq.getNumSequences(), selectedIndices->getHeight()); + + if (dynamic_cast(selectedIndices.get())) { + /* + * Currently, the second input for this layer is generated by + * kmax_sequence_score_layer whose output is always stored on CPU, + * or a data_layer which can be on GPU. + * + * If the second input is on GPU, copy it to CPU memory, because this + * input always uses very little memory, and operations related to it are + * all logic control, not computations. 
+ */ + Matrix::resizeOrCreate(selIdsCpu_, + selectedIndices->getHeight(), + selectedIndices->getWidth(), + false /* trans */, + false /* useGpu */); + selIdsCpu_->copyFrom(*selectedIndices); + } else { + selIdsCpu_ = selectedIndices; + } + + Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions, + inputSeq.subSequenceStartPositions, + inputSeqInfoVec_); + calSelectedCols(selIdsCpu_, inputSeqInfoVec_); + + resetOutput(selectedRows_.size(), getSize()); + getOutputValue()->selectRows(*getInputValue(0), *rowIndice_); +} + +void SubNestedSequenceLayer::backward(const UpdateCallback& callback) { + MatrixPtr inputSeqGrad = getInputGrad(0); + MatrixPtr outputGrad = getOutputGrad(); + + if (inputSeqGrad) outputGrad->addToRows(*inputSeqGrad, *rowIndice_); +} + +} // namespace paddle diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index a43adc7ce7db937bd62ea9bf1533b8a5899c259a..209d0ab9c8d7e8463c8636b1412622a94f359fb1 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -50,7 +50,7 @@ add_unittest_without_exec(test_DetectionOutput test_DetectionOutput.cpp LayerGradUtil.cpp) -add_test(NAME test_DetectionOutput +add_test(NAME test_DetectionOutput COMMAND test_DetectionOutput) ################# test_ConvUnify ####################### add_unittest_without_exec(test_ConvUnify @@ -66,6 +66,16 @@ add_unittest_without_exec(test_BatchNorm add_test(NAME test_BatchNorm COMMAND test_BatchNorm) + + +################# test_KmaxSeqScore ####################### +add_unittest_without_exec(test_KmaxSeqScore + test_KmaxSeqScore.cpp + LayerGradUtil.cpp) + +add_test(NAME test_KmaxSeqScore + COMMAND test_KmaxSeqScore) + ################## test_Evaluator ####################### add_unittest(test_Evaluator test_Evaluator.cpp) diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index 9eca58f1a1baa6fb1c404a91a345bc7f9d6b4acc..fd9cfa1dc7a9028cb2c5c98baca98ffb2a837bac 100644 --- 
a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -400,7 +400,6 @@ void initDataLayer(TestConfig testConf, const std::vector& labelSeqStartPositions = testConf.inputDefs[i].labelSeqStartPositions; if (labelSeqStartPositions.size() != 0) { - CHECK(!sequenceStartPositions); CHECK_GE(static_cast(labelSeqStartPositions.size()), 2); sequenceStartPositions = @@ -410,6 +409,19 @@ void initDataLayer(TestConfig testConf, useGpu); data.sequenceStartPositions = sequenceStartPositions; } + + const std::vector& labelSubSeqStartPositions = + testConf.inputDefs[i].labelSubSeqStartPositions; + if (labelSubSeqStartPositions.size() != 0) { + CHECK_GE(static_cast(labelSubSeqStartPositions.size()), 2); + + subSequenceStartPositions = + ICpuGpuVector::create(labelSubSeqStartPositions.size(), useGpu); + subSequenceStartPositions->copyFrom(labelSubSeqStartPositions.data(), + labelSubSeqStartPositions.size(), + useGpu); + data.subSequenceStartPositions = subSequenceStartPositions; + } break; } default: diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index d299b4dd09418589514d99a72f83e1103ace7de1..5debedf5ef6a3262578ca01b335e664f9a334d35 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -67,6 +67,7 @@ struct InputDef { bool isStatic; std::vector labelInitValue; std::vector labelSeqStartPositions; + std::vector labelSubSeqStartPositions; MatrixPtr selfDefinedData; InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) { @@ -81,8 +82,10 @@ struct InputDef { InputDef(InputType type, string nameIn, MatrixPtr selfDefinedData, - std::vector selfDefinedSeqStartPos = {}) + std::vector selfDefinedSeqStartPos = {}, + std::vector selfDefinedSubSeqStartPos = {}) : labelSeqStartPositions(selfDefinedSeqStartPos), + labelSubSeqStartPositions(selfDefinedSubSeqStartPos), selfDefinedData(selfDefinedData) { inputType = type; name = nameIn; diff --git 
a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp index b201ba8a5a4146ab28cd96454f434f889d72a968..de93972a5880518dfbfb9f8582e17c594e54b9b8 100644 --- a/paddle/gserver/tests/test_ActivationGrad.cpp +++ b/paddle/gserver/tests/test_ActivationGrad.cpp @@ -57,6 +57,39 @@ TEST(Activation, activation) { } } +void testSequenceSoftmaxAct(bool hasSubseq) { + LOG(INFO) << "test activation: sequence softmax"; + + const size_t size = 1; + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("addto"); + config.layerConfig.set_size(size); + config.layerConfig.set_active_type("sequence_softmax"); + config.inputDefs.push_back( + {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, + "layer_0", + 1, + 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "sequence_softmax", + 100, + /* trans= */ false, + useGpu, + /* useWeight */ true); + } +} + +TEST(SequenceSoftmaxActivation, activation) { + for (auto hasSubseq : {false, true}) { + LOG(INFO) << "hasSubseq = " << hasSubseq; + testSequenceSoftmaxAct(hasSubseq); + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp index 83fcfed46cd568d22237eeef9c0215e4e3ad2666..659eefa31bdb1f2433d03a59d5bf4782c71bdecf 100644 --- a/paddle/gserver/tests/test_BatchNorm.cpp +++ b/paddle/gserver/tests/test_BatchNorm.cpp @@ -21,6 +21,8 @@ limitations under the License. 
*/ #include "paddle/utils/GlobalConstants.h" #include "LayerGradUtil.h" +#include "paddle/cuda/include/hl_batch_norm.h" +#include "paddle/math/tests/TensorCheck.h" #include "paddle/testing/TestUtil.h" using namespace paddle; // NOLINT @@ -117,6 +119,74 @@ TEST(Layer, batchNorm) { CHECK_EQ(static_cast(convLayer->getOutputValue()->getWidth()), 576); } +#ifndef PADDLE_ONLY_CPU +void batchNormInference(int n, int c, int h, int w) { + MatrixPtr input = std::make_shared(n, c * h * w); + MatrixPtr cudnnOut = std::make_shared(n, c * h * w); + MatrixPtr cudaOut = std::make_shared(n, c * h * w); + MatrixPtr cudnnCheck = std::make_shared(n, c * h * w); + MatrixPtr cudaCheck = std::make_shared(n, c * h * w); + input->randomizeUniform(); + cudnnOut->zeroMem(); + cudaOut->zeroMem(); + + MatrixPtr scale = std::make_shared(1, c); + scale->randomizeUniform(); + MatrixPtr bias = std::make_shared(1, c); + bias->randomizeUniform(); + + MatrixPtr movingMean = std::make_shared(1, c); + movingMean->randomizeUniform(); + + MatrixPtr movingVar = std::make_shared(1, c); + movingVar->randomizeUniform(); + movingVar->clip(0.01, 50); + + hl_tensor_descriptor ioDesc; + hl_tensor_descriptor bnDesc; + hl_create_tensor_descriptor(&ioDesc); + hl_create_tensor_descriptor(&bnDesc); + hl_tensor_reshape(ioDesc, n, c, h, w); + hl_tensor_reshape(bnDesc, 1, c, 1, 1); + + double EPS = 1E-5; + hl_batch_norm_forward_inference(ioDesc, + input->getData(), + ioDesc, + cudnnOut->getData(), + bnDesc, + scale->getData(), + bias->getData(), + movingMean->getData(), + movingVar->getData(), + EPS); + + hl_batch_norm_cuda_inference(input->getData(), + cudaOut->getData(), + scale->getData(), + bias->getData(), + movingMean->getData(), + movingVar->getData(), + EPS, + n, + c, + h, + w); + + cudnnCheck->copyFrom(*cudnnOut); + cudaCheck->copyFrom(*cudaOut); + autotest::TensorCheckErr(*cudnnCheck, *cudaCheck); + + hl_destroy_tensor_descriptor(ioDesc); + hl_destroy_tensor_descriptor(bnDesc); +} + +TEST(BatchNorm, Inference) 
{ + batchNormInference(33, 267, 1, 1); + batchNormInference(19, 105, 4, 4); +} +#endif + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/gserver/tests/test_KmaxSeqScore.cpp b/paddle/gserver/tests/test_KmaxSeqScore.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f958b4974d45ef65f8f374148a31ad3a6ce7632f --- /dev/null +++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp @@ -0,0 +1,160 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/gserver/layers/DataLayer.h" +#include "paddle/trainer/Trainer.h" +#include "paddle/utils/GlobalConstants.h" + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_bool(thread_local_rand_use_global_seed); + +vector randSampling(int range, int n) { + CHECK_GE(range, n); + vector num(range); + iota(begin(num), end(num), 0); + if (range == n) return num; + + random_shuffle(begin(num), end(num)); + num.resize(n); + return num; +} + +void genRandomSeqInfo(vector& seqStartPosition, + vector& subSeqStartPosition) { + const int maxSeqNum = 100; + // generate random start position information + int seqNum = 1 + (rand() % maxSeqNum); + seqStartPosition.resize(seqNum + 1, 0); + subSeqStartPosition.resize(1, 0); + + for (int i = 0; i < seqNum; ++i) { + int subSeqLen = 1 + (rand() % maxSeqNum); + for (int j = 0; j < subSeqLen; ++j) + subSeqStartPosition.push_back(subSeqStartPosition.back() + subSeqLen); + seqStartPosition[i + 1] = subSeqStartPosition.back(); + } +} + +void genRandomGroundTruth(real* values, + vector>& groundTruth, + vector& startPos, + size_t beamSize) { + groundTruth.resize(startPos.size() - 1, vector(beamSize, -1)); + for (size_t i = 0; i < startPos.size() - 1; ++i) { + int seqLen = startPos[i + 1] - startPos[i]; + vector pos = + randSampling(seqLen, min(static_cast(beamSize), seqLen)); + for (size_t j = 0; j < pos.size(); ++j) { + groundTruth[i][j] = pos[j]; + values[startPos[i] + pos[j]] = 1.; + } + } +} + +void checkLayerOut(vector> groundTruth, + real* layerOut, + size_t beamSize) { + for (size_t i = 0; i < groundTruth.size(); ++i) { + int begPos = i * beamSize; + vector tmp(layerOut + begPos, layerOut + begPos + beamSize); + sort(begin(tmp), end(tmp)); + sort(begin(groundTruth[i]), end(groundTruth[i])); + for (size_t j = 0; j 
< beamSize; ++j) CHECK_EQ(tmp[j], groundTruth[i][j]); + } +} + +TEST(Layer, kmaxSeqScoreLayer) { + const size_t maxBeamSize = 100; + int beamSize = 1 + (rand() % maxBeamSize); + + vector seqStartPosition; + vector subSeqStartPosition; + genRandomSeqInfo(seqStartPosition, subSeqStartPosition); + MatrixPtr inValue = + Matrix::create(subSeqStartPosition.back(), 1, false, false); + + for (auto hasSubseq : {false, true}) { + vector> groundTruth; + inValue->randomizeUniform(); + genRandomGroundTruth(inValue->getData(), + groundTruth, + hasSubseq ? subSeqStartPosition : seqStartPosition, + beamSize); + + for (auto useGpu : {false, true}) { + TestConfig config; + config.layerConfig.set_type("kmax_seq_score"); + config.layerConfig.set_beam_size(beamSize); + + if (hasSubseq) { + config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, + "scores", + inValue, + seqStartPosition, + subSeqStartPosition}); + } else { + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, "scores", inValue, seqStartPosition}); + } + config.layerConfig.add_inputs(); + + // data layer initialize + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + config, + &dataLayers, + &datas, + &layerMap, + "kmax_seq_score", + 100 /* actually this parameter is unused in self-defined input*/, + false, + useGpu); + // test layer initialize + std::vector parameters; + LayerPtr kmaxSeqScoreLayer; + FLAGS_use_gpu = useGpu; + initTestLayer(config, &layerMap, &parameters, &kmaxSeqScoreLayer); + kmaxSeqScoreLayer->forward(PASS_TRAIN); + + const MatrixPtr outValue = kmaxSeqScoreLayer->getOutputValue(); + CHECK_EQ(outValue->getHeight(), + hasSubseq ? 
subSeqStartPosition.size() - 1 + : seqStartPosition.size() - 1); + CHECK_EQ(outValue->getWidth(), beamSize); + checkLayerOut(groundTruth, outValue->getData(), beamSize); + } + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + FLAGS_thread_local_rand_use_global_seed = true; + srand((size_t)(time(NULL))); + return RUN_ALL_TESTS(); +} diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 8ce8600c6743779899b2685c1c12053922265411..0f312b6ca50bc1e6317251ba785f1c61a224b54e 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1899,6 +1899,114 @@ TEST(Layer, CropLayer) { } } +vector randSampling(real range, int n) { + CHECK_GE(range, n); + vector num(range); + iota(begin(num), end(num), 0.); + if (range == n) return num; + + random_shuffle(begin(num), end(num)); + num.resize(n); + sort(begin(num), end(num)); + return num; +} + +TEST(Layer, SubNestedSequenceLayer) { + // layer size is not crucial for this layer, + // so use a small layer size in unittest + const int layerSize = 4; + + const int maxSeqNum = 50; + const int maxSeqLen = 50; + const int maxBeamSize = 32; + + srand((size_t)(time(NULL))); + int beamSize = 1 + (rand() % maxBeamSize); + + TestConfig config; + config.layerConfig.set_type("sub_nested_seq"); + config.layerConfig.set_name("sub_nested_seq_layer"); + config.layerConfig.set_size(layerSize); + + int seqNum = 1 + (rand() % maxSeqNum); + + // sequence information for the first input, it is a nested sequence + vector seqStartPos(seqNum + 1, 0); + vector subSeqStartPos(1, 0); + + // selected indices + MatrixPtr selectedIndices = Matrix::create(seqNum, beamSize, false, false); + selectedIndices->one(); + selectedIndices->mulScalar(-1.); + real* indicesData = selectedIndices->getData(); + + for (int i = 0; i < seqNum; ++i) { + int subSeqNum = 1 + (rand() % maxSeqNum); + for (int j = 0; j < subSeqNum; ++j) { + 
subSeqStartPos.push_back(subSeqStartPos.back() + + (1 + (rand() % maxSeqLen))); + } + vector selSeqs = + randSampling(static_cast(subSeqNum), min(beamSize, subSeqNum)); + memcpy(indicesData + (i * beamSize), + selSeqs.data(), + selSeqs.size() * sizeof(real)); + seqStartPos[i + 1] = subSeqStartPos.back(); + } + + MatrixPtr seqInputPtr = + Matrix::create(seqStartPos.back(), layerSize, false, false); + seqInputPtr->randomizeUniform(); + config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, + "nested_seq_input", + seqInputPtr, + seqStartPos, + subSeqStartPos}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, "selected_indices", selectedIndices}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "sub_nested_seq", + /* batchSize */ seqNum, + /* trans */ false, + /* useGpu*/ useGpu, + /* useWeight */ false); + } +} + +TEST(Layer, ClipLayer) { + const size_t batchSize = 128; + const size_t size = 512; + TestConfig config; + config.layerConfig.set_type("clip"); + config.inputDefs.push_back({INPUT_DATA, "input", size, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + ClipConfig* layerConf = input->mutable_clip_conf(); + double p1 = std::rand() / (double)RAND_MAX; + double p2 = std::rand() / (double)RAND_MAX; + layerConf->set_min(std::min(p1, p2)); + layerConf->set_max(std::max(p1, p2)); + for (auto useGpu : {false, true}) { + testLayerGrad(config, "clip", batchSize, false, useGpu, false); + } +} + +TEST(Layer, RowL2NormLayer) { + const size_t batchSize = 128; + const size_t size = 512; + TestConfig config; + config.layerConfig.set_type("row_l2_norm"); + config.layerConfig.set_size(size); + config.inputDefs.push_back({INPUT_DATA, "input", size, 0}); + config.layerConfig.add_inputs(); + for (auto useGpu : {false, true}) { + testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false); + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, 
argv); initMain(argc, argv); diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu index de48b6fac9c7d8125a552022c52353ef6bcef995..5435808fb7f70fdf1ac98815f7fe8890fb85527c 100644 --- a/paddle/math/BaseMatrix.cu +++ b/paddle/math/BaseMatrix.cu @@ -12,21 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include #include +#include +#include #include "BaseMatrix.h" -#include "hl_matrix_ops.cuh" -#include "hl_matrix_base.cuh" -#include "hl_matrix_apply.cuh" -#include "SIMDFunctions.h" #include "MathFunctions.h" +#include "SIMDFunctions.h" +#include "hl_matrix_apply.cuh" +#include "hl_matrix_base.cuh" +#include "hl_matrix_ops.cuh" namespace paddle { const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported."; -template +template template int BaseMatrixT::applyUnary(Op op) { MatrixOffset offset(0, 0); @@ -34,9 +34,11 @@ int BaseMatrixT::applyUnary(Op op) { return 0; } -template +template template -int BaseMatrixT::applyUnary(Op op, int numRows, int numCols, +int BaseMatrixT::applyUnary(Op op, + int numRows, + int numCols, MatrixOffset& offset) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; int dimM = numRows; @@ -56,7 +58,7 @@ int BaseMatrixT::applyUnary(Op op, int numRows, int numCols, return 0; } -template +template template int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b) { CHECK(height_ == b.height_ && width_ == b.width_) @@ -67,18 +69,23 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b) { return 0; } -template +template template -int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, - MatrixOffset& offset) { +int BaseMatrixT::applyBinary( + Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) { applyBinary(op, b, numRows, numCols, offset, false_type(), false_type()); return 0; } -template +template template -int BaseMatrixT::applyBinary(Op 
op, BaseMatrixT& b, int numRows, int numCols, - MatrixOffset& offset, bAsRowVector, bAsColVector) { +int BaseMatrixT::applyBinary(Op op, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + bAsRowVector, + bAsColVector) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch"; @@ -91,8 +98,8 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, T* A = data_; T* B = b.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); if (!bAsRowVector::value && !bAsColVector::value) { @@ -115,7 +122,7 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, return 0; } -template +template template int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) { CHECK_EQ(height_, b.height_); @@ -129,21 +136,29 @@ int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) { return 0; } -template +template template -int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, - int numRows, int numCols, +int BaseMatrixT::applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, MatrixOffset& offset) { applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type()); return 0; } -template +template template -int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, - int numRows, int numCols, MatrixOffset& offset, - cAsRowVector, cAsColVector) { +int BaseMatrixT::applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + cAsRowVector, + cAsColVector) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; 
CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR; @@ -160,10 +175,10 @@ int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, T* B = b.data_; T* C = c.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); - CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, - offset.cRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); @@ -180,21 +195,21 @@ int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, } if (true == useGpu_) { - hl_gpu_apply_ternary_op - ( + hl_gpu_apply_ternary_op( op, A, B, C, dimM, dimN, lda, ldb, ldc); } else { - hl_cpu_apply_ternary_op - ( + hl_cpu_apply_ternary_op( op, A, B, C, dimM, dimN, lda, ldb, ldc); } return 0; } -template +template template -int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, +int BaseMatrixT::applyQuaternary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, BaseMatrixT& d) { CHECK_EQ(height_, b.height_); CHECK_EQ(width_, b.width_); @@ -209,10 +224,14 @@ int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, return 0; } -template +template template -int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, - BaseMatrixT& d, int numRows, int numCols, +int BaseMatrixT::applyQuaternary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + BaseMatrixT& d, + int numRows, + int numCols, MatrixOffset& offset) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; @@ -234,12 +253,12 @@ int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, T* C = c.data_; T* D = d.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, 
offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); - CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, - offset.cRow_); - CAL_MATRIX_START_ADDRESS(D, d.height_, d.width_, ldd, offset.dCol_, - offset.dRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); + CAL_MATRIX_START_ADDRESS( + D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); @@ -250,22 +269,29 @@ int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, CHECK_LE(dimM + offset.dRow_, d.height_); CHECK_LE(dimN + offset.dCol_, d.width_); if (true == useGpu_) { - hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, - ldc, ldd); + hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd); } else { - hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, - ldc, ldd); + hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd); } return 0; } -template -template +template -int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, - int numRows, int numCols, MatrixOffset& offset, - aAsRowVector, aAsColVector) { +int BaseMatrixT::aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, + aAsColVector) { CHECK_EQ(useGpu_, b.useGpu_); int ld = stride_; @@ -273,10 +299,10 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, T* dst = data_; T* B = b.data_; - CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_, - offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); + CAL_MATRIX_START_ADDRESS( + dst, height_, width_, ld, offset.aCol_, offset.aRow_); + CAL_MATRIX_START_ADDRESS( + B, 
b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); if (aAsRowVector::value && !aAsColVector::value) { if (useGpu_) { @@ -297,12 +323,21 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, return 0; } -template -template +template -int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, - BaseMatrixT& c, int numRows, int numCols, - MatrixOffset& offset, aAsRowVector, +int BaseMatrixT::aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, aAsColVector) { CHECK_EQ(useGpu_, b.useGpu_); CHECK_EQ(useGpu_, c.useGpu_); @@ -314,28 +349,28 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, T* dst = data_; T* B = b.data_; T* C = c.data_; - CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_, - offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); - CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, - offset.cRow_); + CAL_MATRIX_START_ADDRESS( + dst, height_, width_, ld, offset.aCol_, offset.aRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); if (aAsRowVector::value && !aAsColVector::value) { if (useGpu_) { - hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, - ldb, C, ldc); + hl_gpu_matrix_column_op( + agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); } else { - hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, - ldb, C, ldc); + hl_cpu_matrix_column_op( + agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); } } else if (!aAsRowVector::value && aAsColVector::value) { if (useGpu_) { - hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, - ldb, C, ldc); + hl_gpu_matrix_row_op( + agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); } else { - hl_cpu_matrix_row_op(agg, op, sv, numRows, 
numCols, dst, ld, B, - ldb, C, ldc); + hl_cpu_matrix_row_op( + agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); } } else { LOG(FATAL) << "not supported"; @@ -350,15 +385,19 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, */ DEFINE_MATRIX_UNARY_OP(Neg, a = -a); -template -void BaseMatrixT::neg() { applyUnary(unary::Neg()); } +template +void BaseMatrixT::neg() { + applyUnary(unary::Neg()); +} DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a)); -template<> -void BaseMatrixT::exp2() { applyUnary(unary::Exp()); } +template <> +void BaseMatrixT::exp2() { + applyUnary(unary::Exp()); +} DEFINE_MATRIX_UNARY_OP(Log, a = log(a)); -template<> +template <> void BaseMatrixT::log2() { if (useGpu_) { applyUnary(unary::Log()); @@ -368,30 +407,42 @@ void BaseMatrixT::log2() { } DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a)); -template<> -void BaseMatrixT::sqrt2() { applyUnary(unary::Sqrt()); } +template <> +void BaseMatrixT::sqrt2() { + applyUnary(unary::Sqrt()); +} DEFINE_MATRIX_UNARY_OP(Square, a = a * a); -template -void BaseMatrixT::square2() { applyUnary(unary::Square()); } +template +void BaseMatrixT::square2() { + applyUnary(unary::Square()); +} DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a); -template -void BaseMatrixT::reciprocal2() { applyUnary(unary::Reciprocal()); } +template +void BaseMatrixT::reciprocal2() { + applyUnary(unary::Reciprocal()); +} DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? 
a : -a); -template -void BaseMatrixT::abs2() { applyUnary(unary::Abs()); } +template +void BaseMatrixT::abs2() { + applyUnary(unary::Abs()); +} DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0)); -template -void BaseMatrixT::sign2() { applyUnary(unary::Sign()); } +template +void BaseMatrixT::sign2() { + applyUnary(unary::Sign()); +} DEFINE_MATRIX_UNARY_OP(Zero, a = 0); -template -void BaseMatrixT::zero() { applyUnary(unary::Zero()); } +template +void BaseMatrixT::zero() { + applyUnary(unary::Zero()); +} -template +template void BaseMatrixT::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { int numRows = height_; int numCols = numColumns; @@ -400,11 +451,13 @@ void BaseMatrixT::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { } DEFINE_MATRIX_UNARY_OP(One, a = 1); -template -void BaseMatrixT::one() { applyUnary(unary::One()); } +template +void BaseMatrixT::one() { + applyUnary(unary::One()); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p)); -template<> +template <> void BaseMatrixT::pow2(real p) { if (useGpu_) { applyUnary(unary::Pow(p)); @@ -414,44 +467,67 @@ void BaseMatrixT::pow2(real p) { } DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p); -template -void BaseMatrixT::subScalar(T p) { applyUnary(unary::SubScalar(p)); } +template +void BaseMatrixT::subScalar(T p) { + applyUnary(unary::SubScalar(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p); -template -void BaseMatrixT::mulScalar(T p) { applyUnary(unary::MulScalar(p)); } +template +void BaseMatrixT::mulScalar(T p) { + applyUnary(unary::MulScalar(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p); -template -void BaseMatrixT::divScalar(T p) { applyUnary(unary::DivScalar(p)); } +template +void BaseMatrixT::divScalar(T p) { + applyUnary(unary::DivScalar(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p); -template -void BaseMatrixT::assign(T p) { applyUnary(unary::Assign(p)); } +template 
+void BaseMatrixT::assign(T p) { + applyUnary(unary::Assign(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p); -template -void BaseMatrixT::add(T p) { applyUnary(unary::Add(p)); } +template +void BaseMatrixT::add(T p) { + applyUnary(unary::Add(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2); -template -void BaseMatrixT::add(T p1, T p2) { applyUnary(unary::Add2(p1, p2)); } +template +void BaseMatrixT::add(T p1, T p2) { + applyUnary(unary::Add2(p1, p2)); +} -DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER, +DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, + TWO_PARAMETER, a = a < p1 ? p1 : (a > p2 ? p2 : a)); -template -void BaseMatrixT::clip(T p1, T p2) { applyUnary(unary::Clip(p1, p2)); } +template +void BaseMatrixT::clip(T p1, T p2) { + applyUnary(unary::Clip(p1, p2)); +} -DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, ONE_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, + TWO_PARAMETER, + a = b < p1 ? 0 : (b > p2 ? 0 : 1)); +template +void BaseMatrixT::clipDerivative(BaseMatrixT& b, T p1, T p2) { + applyBinary(binary::ClipDerivative(p1, p2), b); +} + +DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, + ONE_PARAMETER, a = a > p ? 1.0f : 0.0f); -template +template void BaseMatrixT::biggerThanScalar(T p) { applyUnary(unary::BiggerThanScalar(p)); } -DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, - a = a > p ? a : p); -template +DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? 
a : p); +template void BaseMatrixT::downClip(T p) { applyUnary(unary::DownClip(p)); } @@ -462,12 +538,12 @@ void BaseMatrixT::downClip(T p) { */ DEFINE_MATRIX_BINARY_OP(Add, a += b); -template +template void BaseMatrixT::add(BaseMatrixT& b) { applyBinary(binary::Add(), b); } -template<> +template <> void BaseMatrixT::add(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Add(), b); @@ -478,7 +554,7 @@ void BaseMatrixT::add(BaseMatrixT& b) { } } -template +template void BaseMatrixT::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { if (columnOffset + b.width_ <= width_) { int numRows = height_; @@ -497,43 +573,53 @@ void BaseMatrixT::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { } } -template +template void BaseMatrixT::addP2P(BaseMatrixT& b) { T* A = data_; T* B = b.data_; int dimM = height_; int dimN = width_; - hl_gpu_apply_binary_op, 0, 0> - (binary::Add(), A, B, dimM, dimN, dimN, dimN); + hl_gpu_apply_binary_op, 0, 0>( + binary::Add(), A, B, dimM, dimN, dimN, dimN); } -template +template void BaseMatrixT::addColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add(), b, numRows, numCols, offset, false_type(), + applyBinary(binary::Add(), + b, + numRows, + numCols, + offset, + false_type(), true_type() /* bAsColVector */); } -template +template void BaseMatrixT::addRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add(), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::Add(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p) { applyBinary(binary::Add1(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p)); -template<> +template <> void 
BaseMatrixT::pow2(BaseMatrixT& b, real p) { if (useGpu_) { applyBinary(binary::Pow(p), b); @@ -543,36 +629,45 @@ void BaseMatrixT::pow2(BaseMatrixT& b, real p) { } DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::Add2(p1, p2), b); } -template +template void BaseMatrixT::addBias(BaseMatrixT& b, T scale) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add1(scale), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::Add1(scale), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } DEFINE_MATRIX_BINARY_OP(Sub, a -= b); -template -void BaseMatrixT::sub(BaseMatrixT& b) { applyBinary(binary::Sub(), b); } +template +void BaseMatrixT::sub(BaseMatrixT& b) { + applyBinary(binary::Sub(), b); +} DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p); -template +template void BaseMatrixT::sub(BaseMatrixT& b, T p) { applyBinary(binary::Sub1(p), b); } DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f); -template -void BaseMatrixT::relu(BaseMatrixT& b) { applyBinary(binary::Relu(), b); } +template +void BaseMatrixT::relu(BaseMatrixT& b) { + applyBinary(binary::Relu(), b); +} DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f)); -template +template void BaseMatrixT::reluDerivative(BaseMatrixT& b) { applyBinary(binary::ReluDerivative(), b); } @@ -582,7 +677,7 @@ DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0; ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a)))); -template<> +template <> void BaseMatrixT::softrelu(BaseMatrixT& b) { applyBinary(binary::Softrelu(), b); } @@ -592,97 +687,100 @@ DEFINE_MATRIX_BINARY_OP( a *= (1.0 - exp(-1.0 * ((b > THRESHOLD) ? THRESHOLD : ((b < -THRESHOLD) ? 
(-THRESHOLD) : b))))); -template<> +template <> void BaseMatrixT::softreluDerivative(BaseMatrixT& b) { applyBinary(binary::SoftreluDerivative(), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1; b = b < p2 ? b : p2); -template +template void BaseMatrixT::brelu(BaseMatrixT& b) { - int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. + int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. applyBinary(binary::Brelu(p1, p2), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, + TWO_PARAMETER, a *= (b > p1 && b < p2) ? 1.0 : 0.0); -template +template void BaseMatrixT::breluDerivative(BaseMatrixT& b) { int p1 = 0, p2 = 24; applyBinary(binary::BreluDerivative(p1, p2), b); } DEFINE_MATRIX_BINARY_OP(Square, b = a * a); -template +template void BaseMatrixT::square2(BaseMatrixT& b) { applyBinary(binary::Square(), b); } DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b); -template +template void BaseMatrixT::squareDerivative(BaseMatrixT& b) { applyBinary(binary::SquareDerivative(), b); } -DEFINE_MATRIX_BINARY_OP(Tanh, - T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); -template<> +DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? 
EXP_MAX_INPUT : tmp; + b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); +template <> void BaseMatrixT::tanh(BaseMatrixT& b) { applyBinary(binary::Tanh(), b); } DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b); -template +template void BaseMatrixT::tanhDerivative(BaseMatrixT& b) { applyBinary(binary::TanhDerivative(), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanh, TWO_PARAMETER, - b = p1 * - (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); -template<> +DEFINE_MATRIX_BINARY_PARAMETER_OP( + ScaledTanh, TWO_PARAMETER, b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); +template <> void BaseMatrixT::scaledTanh(BaseMatrixT& b, real p1, real p2) { applyBinary(binary::ScaledTanh(p1, p2), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, + TWO_PARAMETER, a *= p2 * (p1 - b * b)); -template +template void BaseMatrixT::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::ScaledTanhDerivative(p1 * p1, p2 / p1), b); } DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a); -template +template void BaseMatrixT::reciprocal2(BaseMatrixT& b) { applyBinary(binary::Reciprocal(), b); } DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b); -template +template void BaseMatrixT::reciprocalDerivative(BaseMatrixT& b) { applyBinary(binary::ReciprocalDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a); -template -void BaseMatrixT::abs2(BaseMatrixT& b) { applyBinary(binary::Abs(), b); } +template +void BaseMatrixT::abs2(BaseMatrixT& b) { + applyBinary(binary::Abs(), b); +} DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0); -template +template void BaseMatrixT::absDerivative(BaseMatrixT& b) { applyBinary(binary::AbsDerivative(), b); } -DEFINE_MATRIX_BINARY_OP( - Sigmoid, const T THRESHOLD_MIN = -40.0; const T THRESHOLD_MAX = 13.0; - T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN - : ((a > THRESHOLD_MAX) ? 
THRESHOLD_MAX : a); - b = 1.0f / (1.0f + exp(-tmp))); -template<> +DEFINE_MATRIX_BINARY_OP(Sigmoid, const T THRESHOLD_MIN = -40.0; + const T THRESHOLD_MAX = 13.0; + T tmp = (a < THRESHOLD_MIN) + ? THRESHOLD_MIN + : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a); + b = 1.0f / (1.0f + exp(-tmp))); +template <> void BaseMatrixT::sigmoid(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Sigmoid(), b); @@ -716,31 +814,31 @@ void BaseMatrixT::sigmoid(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b)); -template +template void BaseMatrixT::sigmoidDerivative(BaseMatrixT& b) { applyBinary(binary::SigmoidDerivative(), b); } DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b); -template +template void BaseMatrixT::expDerivative(BaseMatrixT& b) { applyBinary(binary::ExpDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f); -template +template void BaseMatrixT::sign2(BaseMatrixT& b) { applyBinary(binary::Sign(), b); } DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b)); -template<> +template <> void BaseMatrixT::exp2(BaseMatrixT& b) { applyBinary(binary::Exp(), b); } DEFINE_MATRIX_BINARY_OP(Log, a = log(b)); -template<> +template <> void BaseMatrixT::log2(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Log(), b); @@ -750,13 +848,13 @@ void BaseMatrixT::log2(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b)); -template<> +template <> void BaseMatrixT::sqrt2(BaseMatrixT& b) { applyBinary(binary::Sqrt(), b); } DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b)); -template<> +template <> void BaseMatrixT::invSqrt(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::InvSqrt(), b); @@ -768,37 +866,37 @@ void BaseMatrixT::invSqrt(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p)); -template +template void BaseMatrixT::isEqualTo(BaseMatrixT& b, T value) { applyBinary(binary::IsEqual(value), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p); -template +template void 
BaseMatrixT::addScalar(BaseMatrixT& b, T p) { applyBinary(binary::AddScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p); -template +template void BaseMatrixT::subScalar(BaseMatrixT& b, T p) { applyBinary(binary::SubScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p); -template +template void BaseMatrixT::mulScalar(BaseMatrixT& b, T p) { applyBinary(binary::MulScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p); -template +template void BaseMatrixT::divScalar(BaseMatrixT& b, T p) { applyBinary(binary::DivScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b); -template +template void BaseMatrixT::scalarDiv(BaseMatrixT& b, T p) { applyBinary(binary::ScalarDiv(p), b); } @@ -810,20 +908,20 @@ void BaseMatrixT::scalarDiv(BaseMatrixT& b, T p) { DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy, a = -c * log(b) - (1 - c) * log(1 - b)); -template<> +template <> void BaseMatrixT::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::SoftCrossEntropy(), b, c); } DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b))); -template +template void BaseMatrixT::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::SoftCrossEntropyBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy, a = c > 0.5 ? -log(b) : -log(1.0 - b)); -template<> +template <> void BaseMatrixT::binaryLabelCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { if (useGpu_) { @@ -851,70 +949,73 @@ void BaseMatrixT::binaryLabelCrossEntropy(BaseMatrixT& b, DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp, a += c > 0.5 ? 
-1.0 / b : 1.0 / (1.0 - b)); -template +template void BaseMatrixT::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::BinaryCrossEntropyBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(Add, a = b + c); -template +template void BaseMatrixT::add(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Add(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { applyTernary(ternary::Add1(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c); -template +template void BaseMatrixT::sub(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Sub(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c); -template +template void BaseMatrixT::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { applyTernary(ternary::Sub1(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c); -template +template void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Add2(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, + THREE_PARAMETER, a = p1 * a + p2 * b + p3 * c); -template +template void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { applyTernary(ternary::Add3(p1, p2, p3), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, + THREE_PARAMETER, c = p2 * c - p1 * (b + p3 * a); a = a + c); -template +template void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad BaseMatrixT& c, // mom - T p1, // learningRate, - T p2, // momentum, - T p3) { // decayRate + T p1, // learningRate, + T p2, // momentum, + T p3) { // decayRate applyTernary(ternary::SgdUpdate(p1, p2, p3), b, c); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, + THREE_PARAMETER, c = p2 * c - p1 * d * (b + 
p3 * a); a += c); -template +template void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad, BaseMatrixT& c, // mom, BaseMatrixT& d, // lr, - T p1, // learningRate, - T p2, // momentum, - T p3) { // decayRate + T p1, // learningRate, + T p2, // momentum, + T p3) { // decayRate applyQuaternary(quaternary::SgdUpdate(p1, p2, p3), b, c, d); } @@ -922,19 +1023,22 @@ DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b; a = (a > lambda) ? (a - lambda) : (a < -lambda) ? (a + lambda) : 0); -template +template void BaseMatrixT::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) { applyBinary(binary::ApplyL1(learningRate * decayRate), lr); } -template<> +template <> void BaseMatrixT::applyL1(BaseMatrixT& lr, real learningRate, real decayRate) { if (useGpu_) { applyBinary(binary::ApplyL1(learningRate * decayRate), lr); } else { - simd::decayL1(this->data_, this->data_, lr.data_, learningRate * decayRate, + simd::decayL1(this->data_, + this->data_, + lr.data_, + learningRate * decayRate, height_ * width_); } } @@ -943,24 +1047,25 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p; a = (a > lambda) ? (a - lambda) : (a < -lambda) ? 
(a + lambda) : 0); -template +template void BaseMatrixT::applyL1(T learningRate, T decayRate) { applyUnary(unary::ApplyL1(learningRate * decayRate)); } -template<> +template <> void BaseMatrixT::applyL1(real learningRate, real decayRate) { if (useGpu_) { applyUnary(unary::ApplyL1(learningRate * decayRate)); } else { - simd::decayL1(this->data_, this->data_, learningRate * decayRate, - height_ * width_); + simd::decayL1( + this->data_, this->data_, learningRate * decayRate, height_ * width_); } } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, ONE_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, + ONE_PARAMETER, a *= (1.0f / (1.0f + p * b))); -template +template void BaseMatrixT::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { if (useGpu_) { applyBinary(binary::ApplyL2(learningRate * decayRate), lr); @@ -973,32 +1078,33 @@ void BaseMatrixT::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { } } -template +template void BaseMatrixT::applyL2(T learningRate, T decayRate) { BaseMatrixT::mulScalar(1.0f / (1.0f + learningRate * decayRate)); } DEFINE_MATRIX_BINARY_OP(DotMul, a *= b); -template +template void BaseMatrixT::dotMul(BaseMatrixT& b) { applyBinary(binary::DotMul(), b); } DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c); -template +template void BaseMatrixT::dotMul(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotMul(), b, c); } DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c); -template +template void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotDiv(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, + TWO_PARAMETER, a = (b + p1) / (c + p2)); -template +template void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotDiv2P(p1, p2), b, c); } @@ -1008,7 +1114,7 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c; ? THRESHOLD : ((a < -THRESHOLD) ? 
(-THRESHOLD) : a); a = log(1 + exp(a)) - a * d); -template<> +template <> void BaseMatrixT::rankLoss(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1019,8 +1125,9 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c; a = (a > THRESHOLD) ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a); - a = exp(a); a = (a / (1 + a) - d)); -template<> + a = exp(a); + a = (a / (1 + a) - d)); +template <> void BaseMatrixT::rankLossBp(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1033,7 +1140,7 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0; ? -THRESHOLD : b; a = log(1 + exp(x)) - c * x); -template<> +template <> void BaseMatrixT::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::LogisticRegressionLoss(), b, c); } @@ -1043,22 +1150,23 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0; T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) ? -THRESHOLD : b; - x = exp(x); a = x / (1 + x) - c); -template<> + x = exp(x); + a = x / (1 + x) - c); +template <> void BaseMatrixT::logisticRegressionLossBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::LogisticRegressionLossBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f); -template +template void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::BiggerThan(), b, c); } DEFINE_MATRIX_QUATERNARY_OP( BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f); -template +template void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1066,25 +1174,34 @@ void BaseMatrixT::biggerThan(BaseMatrixT& b, } DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? 
b : c); -template +template void BaseMatrixT::max2(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Max(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, ONE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, + ONE_PARAMETER, c += ((a > p) == (b > p)) ? 0.0f : 1.0f); -template -void BaseMatrixT::binaryClassificationError2(size_t destCol, BaseMatrixT& b, - BaseMatrixT& c, T p) { +template +void BaseMatrixT::binaryClassificationError2(size_t destCol, + BaseMatrixT& b, + BaseMatrixT& c, + T p) { CHECK(!useGpu_) << "do not support gpu"; MatrixOffset offset(0, 0, 0, 0, destCol, 0); int numRows = b.height_; int numCols = b.width_; - b.applyTernary(ternary::BinaryClassificationError(p), c, *this, numRows, - numCols, offset, false_type(), true_type() /*cAsColVector*/); + b.applyTernary(ternary::BinaryClassificationError(p), + c, + *this, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } -template<> +template <> void BaseMatrixT::binaryClassificationError(size_t destCol, BaseMatrixT& b, BaseMatrixT& c, @@ -1092,127 +1209,148 @@ void BaseMatrixT::binaryClassificationError(size_t destCol, MatrixOffset offset(destCol, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; - aggregate(aggregate::sum(), base::binary::classificationError(p), - base::binary::add(), b, c, numRows, numCols, offset, false_type(), + aggregate(aggregate::sum(), + base::binary::classificationError(p), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + false_type(), true_type() /*aAsColVector*/); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, + THREE_PARAMETER, a = p1 * b + p2 * c + p3 * d); -template -void BaseMatrixT::add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, - T p2, T p3) { +template +void BaseMatrixT::add3( + BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) { 
applyQuaternary(quaternary::Add3(p1, p2, p3), b, c, d); } DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c); -template +template void BaseMatrixT::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotMulSquare(), b, c); } DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c); -template +template void BaseMatrixT::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotSquareSquare(), b, c); } DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b); -template +template void BaseMatrixT::dotMulSquare(BaseMatrixT& b) { applyBinary(binary::DotMulSquare(), b); } DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b); -template +template void BaseMatrixT::dotSquareMul(BaseMatrixT& b) { applyBinary(binary::DotSquareMul(), b); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, + THREE_PARAMETER, T tmp = p1 * b + p2 * c + p3 * d; a += tmp * tmp); -template -void BaseMatrixT::addSquareSum(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, - T p1, T p2, T p3) { +template +void BaseMatrixT::addSquareSum( + BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) { applyQuaternary(quaternary::AddSquareSum(p1, p2, p3), b, c, d); } DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b); -template +template void BaseMatrixT::addSquare(BaseMatrixT& b, T p) { applyBinary(binary::AddSquare(p), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, + TWO_PARAMETER, a = p1 * a + p2 * b * b); -template +template void BaseMatrixT::decayAddSquare(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::DecayAddSquare(p1, p2), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, + TWO_PARAMETER, a = p1 * a + p2 * b * b * c * c); -template -void BaseMatrixT::decayAddSquareMul(BaseMatrixT& b, BaseMatrixT& c, T p1, 
+template +void BaseMatrixT::decayAddSquareMul(BaseMatrixT& b, + BaseMatrixT& c, + T p1, T p2) { applyTernary(ternary::DecayAddSquareMul(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, + THREE_PARAMETER, a = 1 / (p1 * b + p2 * c + p3)); -template -void BaseMatrixT::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, - T p3) { +template +void BaseMatrixT::reciprocalSum( + BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { applyTernary(ternary::ReciprocalSum(p1, p2, p3), b, c); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, + TWO_PARAMETER, a = 1 / (p1 * b + p2)); -template +template void BaseMatrixT::reciprocal2(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::Reciprocal2(p1, p2), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, + TWO_PARAMETER, T tmp = p1 * b + p2 * c; a *= tmp * tmp); -template -void BaseMatrixT::dotMulSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, +template +void BaseMatrixT::dotMulSquareSum(BaseMatrixT& b, + BaseMatrixT& c, + T p1, T p2) { applyTernary(ternary::DotMulSquareSum(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, + TWO_PARAMETER, T tmp = p1 * b + p2 * c; a = tmp * tmp); -template +template void BaseMatrixT::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotSquareSum(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, + TWO_PARAMETER, a *= p1 * b + p2 * c); -template +template void BaseMatrixT::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotMulSum(p1, p2), b, c); } DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0); -template +template void 
BaseMatrixT::copyAndClear(BaseMatrixT& b) { applyBinary(binary::CopyAndClear(), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, + TWO_PARAMETER, a = p1 * a + p2 * b * c); -template +template void BaseMatrixT::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::AddDotMul(p1, p2), b, c); } DEFINE_MATRIX_BINARY_OP(Assign, a = b;); -template +template void BaseMatrixT::assign(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Assign(), b); @@ -1223,7 +1361,7 @@ void BaseMatrixT::assign(BaseMatrixT& b) { } } -template +template void BaseMatrixT::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { if (columnOffset + b.width_ <= width_) { int numRows = height_; @@ -1243,24 +1381,31 @@ void BaseMatrixT::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { } DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp); -template +template void BaseMatrixT::deepSwap(BaseMatrixT& b) { - applyBinary(binary::DeepSwap(), b); + applyBinary(binary::DeepSwap(), b); } -template<> +template <> void BaseMatrixT::rowDotMul(size_t destCol, BaseMatrixT& b, BaseMatrixT& c) { int numRows = b.height_; int numCols = b.width_; MatrixOffset offset(destCol, 0, 0, 0, 0, 0); - aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, - numRows, numCols, offset, false_type(), + aggregate(aggregate::sum(), + base::binary::mul(), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + false_type(), true_type() /*aAsColVector*/); } -template +template void BaseMatrixT::rowDotMul2(size_t destCol, BaseMatrixT& b, BaseMatrixT& c) { @@ -1283,17 +1428,24 @@ void BaseMatrixT::rowDotMul2(size_t destCol, } } -template<> +template <> void BaseMatrixT::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; - aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, - numRows, numCols, 
offset, true_type() /*aAsRowVector*/, + aggregate(aggregate::sum(), + base::binary::mul(), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, false_type()); } -template +template void BaseMatrixT::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1314,16 +1466,22 @@ void BaseMatrixT::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { } DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c); -template +template void BaseMatrixT::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, - true_type() /*cAsRowVector*/, false_type()); + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + true_type() /*cAsRowVector*/, + false_type()); } -template +template void BaseMatrixT::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1343,16 +1501,22 @@ void BaseMatrixT::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::DotMul(), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::DotMul(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } -template +template void BaseMatrixT::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1372,52 +1536,82 @@ void BaseMatrixT::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, cRow); int numRows = height_; int numCols = width_; - applyTernary(ternary::DotMul(), b, c, 
numRows, numCols, offset, - true_type() /* cAsRowVector */, false_type() /* cAsColVector */); + applyTernary(ternary::DotMul(), + b, + c, + numRows, + numCols, + offset, + true_type() /* cAsRowVector */, + false_type() /* cAsColVector */); } -template +template void BaseMatrixT::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, cRow); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, - true_type() /* cAsRowVector */, false_type() /* cAsColVector */); + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + true_type() /* cAsRowVector */, + false_type() /* cAsColVector */); } -template +template void BaseMatrixT::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c); -template +template void BaseMatrixT::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::RowAdd(p), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::RowAdd(p), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c)); -template<> +template <> void BaseMatrixT::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { if (useGpu_) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::RowPow(), b, c, numRows, numCols, offset, - false_type(), true_type() 
/*cAsColVector*/); + applyTernary(ternary::RowPow(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } else { size_t height = this->height_; size_t width = this->width_; @@ -1434,44 +1628,64 @@ void BaseMatrixT::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::mulRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotMul(), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::DotMul(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b); -template +template void BaseMatrixT::divRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotDiv(), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::DotDiv(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } -template +template void BaseMatrixT::mulColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotMul(), b, numRows, numCols, offset, - false_type(), true_type() /* bAsColVector */); + applyBinary(binary::DotMul(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /* bAsColVector */); } -template +template void BaseMatrixT::divColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotDiv(), b, numRows, numCols, offset, - false_type(), true_type() /* bAsColVector */); + applyBinary(binary::DotDiv(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /* bAsColVector */); } -template<> +template <> template int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ 
-1479,13 +1693,20 @@ int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(height_, numRows); CHECK_EQ(width_, 1UL); - aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, - numCols, offset, false_type(), true_type() /*aAsColVector*/); + aggregate(agg, + base::unary::identity(), + base::binary::second(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); return 0; } -template<> +template <> template int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1493,16 +1714,25 @@ int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(height_, numRows); CHECK_EQ(width_, 1UL); - aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, - false_type(), true_type() /*aAsColVector*/); + aggregate(agg, + base::unary::identity(), + sv, + b, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); return 0; } -template<> +template <> template -int BaseMatrixT::applyRow( - Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { +int BaseMatrixT::applyRow(Agg agg, + real scaleDest, + real scaleAgg, + BaseMatrixT& b) { if (scaleDest != 0) { applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b); } else { @@ -1514,10 +1744,10 @@ int BaseMatrixT::applyRow( return 0; } -template<> +template <> template -int BaseMatrixT::applyRow(Agg agg, Op op, Saver sv, - BaseMatrixT& b, BaseMatrixT& c) { +int BaseMatrixT::applyRow( + Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); size_t numRows = b.height_; size_t numCols = b.width_; @@ -1525,16 +1755,27 @@ int BaseMatrixT::applyRow(Agg agg, Op op, Saver sv, CHECK_EQ(width_, 1UL); CHECK_EQ(c.height_, numRows); CHECK_EQ(c.width_, numCols); - aggregate(agg, op, sv, - b, c, numRows, numCols, offset, - false_type(), true_type() /*aAsColVector*/); + aggregate(agg, + op, + sv, 
+ b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); return 0; } -template<> +template <> template -int BaseMatrixT::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, - BaseMatrixT& b, BaseMatrixT& c) { +int BaseMatrixT::applyRow(Agg agg, + Op op, + real scaleDest, + real scaleAgg, + BaseMatrixT& b, + BaseMatrixT& c) { if (scaleDest != 0) { applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c); } else { @@ -1546,7 +1787,7 @@ int BaseMatrixT::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, return 0; } -template<> +template <> template int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1554,13 +1795,20 @@ int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(width_, numCols); CHECK_EQ(height_, 1UL); - aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, - numCols, offset, true_type() /*aAsRowVector*/, false_type()); + aggregate(agg, + base::unary::identity(), + base::binary::second(), + b, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, + false_type()); return 0; } -template<> +template <> template int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1568,16 +1816,25 @@ int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(width_, numCols); CHECK_EQ(height_, 1UL); - aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, - true_type() /*aAsRowVector*/, false_type()); + aggregate(agg, + base::unary::identity(), + sv, + b, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, + false_type()); return 0; } -template<> +template <> template -int BaseMatrixT::applyCol( - Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { +int BaseMatrixT::applyCol(Agg agg, + real scaleDest, + real scaleAgg, + BaseMatrixT& b) { if (scaleDest != 0) { applyCol(agg, 
base::binary::add2(scaleDest, scaleAgg), b); } else { @@ -1589,48 +1846,51 @@ int BaseMatrixT::applyCol( return 0; } -template<> +template <> void BaseMatrixT::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) { applyRow(aggregate::sum(), scaleDest, scaleSum, b); } -template<> +template <> void BaseMatrixT::maxRows(BaseMatrixT& b) { applyRow(aggregate::max(), b); } -template<> +template <> void BaseMatrixT::minRows(BaseMatrixT& b) { applyRow(aggregate::min(), b); } -template<> +template <> void BaseMatrixT::maxCols(BaseMatrixT& b) { applyCol(aggregate::max(), b); } -template<> +template <> void BaseMatrixT::minCols(BaseMatrixT& b) { applyCol(aggregate::min(), b); } -template<> +template <> void BaseMatrixT::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) { applyCol(aggregate::sum(), scaleDest, scaleSum, b); } -template<> -void BaseMatrixT::sumOfSquaredDiffs( - BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { - applyRow(aggregate::sum(), base::binary::squaredDiff(), - scaleDest, scaleSum, b, c); +template <> +void BaseMatrixT::sumOfSquaredDiffs(BaseMatrixT& b, + BaseMatrixT& c, + real scaleSum, + real scaleDest) { + applyRow( + aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c); } -template<> -void BaseMatrixT::sumOfProducts( - BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { - applyRow(aggregate::sum(), base::binary::mul(), - scaleDest, scaleSum, b, c); +template <> +void BaseMatrixT::sumOfProducts(BaseMatrixT& b, + BaseMatrixT& c, + real scaleSum, + real scaleDest) { + applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c); } template class BaseMatrixT; diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h index 120d69f718b954925438fbd2119d69f0be13b3e9..12ad2d45a0bbff182e78da6efb3c5ff4c6b59b55 100644 --- a/paddle/math/BaseMatrix.h +++ b/paddle/math/BaseMatrix.h @@ -488,6 +488,13 @@ public: */ void clip(T p1, T p2); + /** + * this = b < low ? 
0 : 1 + * + * this = b > high ? 0 : 1 + */ + void clipDerivative(BaseMatrixT& b, T p1, T p2); + /** * @code * a = a > p ? 1.0f : 0.0f diff --git a/paddle/math/MathUtils.cpp b/paddle/math/MathUtils.cpp index 5bbc3e4e3725f186373072440a93f967178e0b27..980b6e138873046468f278c2f0b16938be82b81c 100644 --- a/paddle/math/MathUtils.cpp +++ b/paddle/math/MathUtils.cpp @@ -25,7 +25,7 @@ namespace paddle { */ void sparseRand( int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu) { - CHECK(size_t(nnz) > size_t(1)); + CHECK(size_t(nnz) >= size_t(1)); int* cpuMajor; int* cpuMinor; CpuIVector cpuMinorVec(nnz); diff --git a/paddle/math/TrainingAlgorithmOp.cu b/paddle/math/TrainingAlgorithmOp.cu index 72ff077270382d52bfcd340cc64d9abf49d1705d..fc746b85339de596d5ddc5811a8164094c13f63f 100644 --- a/paddle/math/TrainingAlgorithmOp.cu +++ b/paddle/math/TrainingAlgorithmOp.cu @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/utils/Logging.h" #include "BaseMatrix.h" #include "TrainingAlgorithmOp.h" +#include "paddle/utils/Logging.h" #if __cplusplus > 199711L @@ -32,10 +32,10 @@ void sparseMomentumApply(BaseMatrix& value, real tau, real learningRate) { auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad); - auto expr2 = momV.lazyAssign( - momV + (tau * alpha * gamma * learningRate) * grad); - auto expr3 = value.lazyAssign( - (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV); + auto expr2 = + momV.lazyAssign(momV + (tau * alpha * gamma * learningRate) * grad); + auto expr3 = value.lazyAssign((tau / beta + (real)1 / alpha) * momU + + ((real)1 / beta) * momV); AssignEvaluate(expr1, expr2, expr3); } @@ -52,12 +52,12 @@ void adadeltaApply(BaseMatrix& value, real momentum, real decayRate) { auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square()); - auto expr2 = lr.lazyAssign( - ((accum_update + epsilon) / (accum + epsilon)).sqrt()); - auto expr3 = accum_update.lazyAssign( - rou * accum_update + ((real)1 - rou) * (grad * lr).square()); - auto expr4 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr2 = + lr.lazyAssign(((accum_update + epsilon) / (accum + epsilon)).sqrt()); + auto expr3 = accum_update.lazyAssign(rou * accum_update + + ((real)1 - rou) * (grad * lr).square()); + auto expr4 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr5 = value.lazyAssign(value + mom); AssignEvaluate(expr1, expr2, expr3, expr4, expr5); @@ -74,10 +74,10 @@ void adagradApply(BaseMatrix& value, real momentum, real decayRate) { auto expr1 = accum.lazyAssign(accum + grad.square()); - auto expr2 = lr.lazyAssign( - (accum_buffer + accum + epsilon).sqrt().reciprocal()); - auto expr3 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr2 = + lr.lazyAssign((accum_buffer + accum + epsilon).sqrt().reciprocal()); 
+ auto expr3 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr4 = value.lazyAssign(value + mom); AssignEvaluate(expr1, expr2, expr3, expr4); @@ -98,8 +98,8 @@ void rmspropApply(BaseMatrix& value, bool firstTime) { auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad); auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal()); - auto expr4 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr4 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr5 = value.lazyAssign(value + mom); if (firstTime) { @@ -107,8 +107,8 @@ void rmspropApply(BaseMatrix& value, AssignEvaluate(expr1, expr2, expr3, expr4, expr5); } else { - auto expr1 = g.lazyAssign( - accumulatedRou * g + ((real)1 - rou) * grad.square()); + auto expr1 = + g.lazyAssign(accumulatedRou * g + ((real)1 - rou) * grad.square()); AssignEvaluate(expr1, expr2, expr3, expr4, expr5); } @@ -127,8 +127,8 @@ void decayedAdagradApply(BaseMatrix& value, real decayRate, bool firstTime) { auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal()); - auto expr3 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr3 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr4 = value.lazyAssign(value + mom); if (firstTime) { @@ -136,8 +136,8 @@ void decayedAdagradApply(BaseMatrix& value, AssignEvaluate(expr1, expr2, expr3, expr4); } else { - auto expr1 = accum.lazyAssign( - accumulatedRou * accum + ((real)1 - rou) * grad.square()); + auto expr1 = accum.lazyAssign(accumulatedRou * accum + + ((real)1 - rou) * grad.square()); AssignEvaluate(expr1, expr2, expr3, expr4); } @@ -153,13 +153,12 @@ void adamApply(BaseMatrix& value, real beta2_power, real epsilon, real learningRate) { - real alpha = learningRate * - std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); + real alpha = + 
learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad); auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square()); - auto expr3 = value.lazyAssign( - value - (mom * alpha) / (v.sqrt() + epsilon)); + auto expr3 = value.lazyAssign(value - (mom * alpha) / (v.sqrt() + epsilon)); AssignEvaluate(expr1, expr2, expr3); } @@ -173,10 +172,10 @@ void adamaxApply(BaseMatrix& value, int64_t step, real alpha) { auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad); - auto expr2 = u.lazyAssign( - (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs())); + auto expr2 = + u.lazyAssign((beta2 * u > grad.abs()).condition(beta2 * u, grad.abs())); auto expr3 = value.lazyAssign( - value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u)); + value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u)); AssignEvaluate(expr1, expr2, expr3); } @@ -322,8 +321,8 @@ void adamApply(BaseMatrix& value, real beta2_power, real epsilon, real learningRate) { - real alpha = learningRate * - std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); + real alpha = + learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; mom = beta1 * mom + ((real)1 - beta1) * grad; @@ -331,7 +330,7 @@ void adamApply(BaseMatrix& value, // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2 v = beta2 * v + ((real)1 - beta2) * grad.square(); - value -= (mom * alpha) / (v.sqrt() + epsilon); + value -= (mom * alpha) / (v.sqrt() + epsilon); } void adamaxApply(BaseMatrix& value, diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu index 40e38434fa328bba8be6e1b8e509023d615899c1..31b693afa8bd50f77a8efb67769e6215dd755bd3 100644 --- a/paddle/math/tests/test_Tensor.cu +++ b/paddle/math/tests/test_Tensor.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and 
limitations under the License. */ #include -#include "paddle/math/Matrix.h" #include "TensorCheck.h" +#include "paddle/math/Matrix.h" using paddle::Matrix; using paddle::CpuMatrix; @@ -26,25 +26,25 @@ using paddle::GpuIVector; using autotest::TensorCheckEqual; using autotest::TensorCheckErr; -#define INIT_UNARY(A1, A2) \ - Tensor A1(height, width); \ - Tensor A2(height, width); \ - A1.randomizeUniform(); \ - A2.copyFrom(A1) -#define INIT_BINARY(A1, A2, B) \ - INIT_UNARY(A1, A2); \ - Tensor B(height, width); \ - B.randomizeUniform() -#define INIT_TERNARY(A1, A2, B, C) \ - INIT_BINARY(A1, A2, B); \ - Tensor C(height, width); \ - C.randomizeUniform() -#define INIT_QUATERNARY(A1, A2, B, C, D) \ - INIT_TERNARY(A1, A2, B, C); \ - Tensor D(height, width); \ - D.randomizeUniform() - -template +#define INIT_UNARY(A1, A2) \ + Tensor A1(height, width); \ + Tensor A2(height, width); \ + A1.randomizeUniform(); \ + A2.copyFrom(A1) +#define INIT_BINARY(A1, A2, B) \ + INIT_UNARY(A1, A2); \ + Tensor B(height, width); \ + B.randomizeUniform() +#define INIT_TERNARY(A1, A2, B, C) \ + INIT_BINARY(A1, A2, B); \ + Tensor C(height, width); \ + C.randomizeUniform() +#define INIT_QUATERNARY(A1, A2, B, C, D) \ + INIT_TERNARY(A1, A2, B, C); \ + Tensor D(height, width); \ + D.randomizeUniform() + +template struct TestUnaryMatrix { typedef std::function UnaryFunc; @@ -59,7 +59,7 @@ struct TestUnaryMatrix { } }; -template +template struct TestBinaryMatrix { typedef std::function BinaryFunc; @@ -74,10 +74,10 @@ struct TestBinaryMatrix { } }; -template +template struct TestTernaryMatrix { - typedef std::function TernaryFunc; + typedef std::function + TernaryFunc; explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) { for (auto height : {1, 11, 73, 128, 200, 330}) { @@ -90,10 +90,11 @@ struct TestTernaryMatrix { } }; -template +template struct TestQuaternaryMatrix { typedef std::function QuaternaryFunc; + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)> + QuaternaryFunc; explicit 
TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) { for (auto height : {1, 11, 73, 128, 200, 330}) { @@ -106,7 +107,7 @@ struct TestQuaternaryMatrix { } }; -template +template struct TestUnaryVectorT { typedef std::function UnaryFunc; @@ -142,11 +143,11 @@ void SetTensorValue(Matrix& matrix, real value) { } } -template +template void testTensorAddScalar(Tensor& A1, Tensor& A2) { real p1 = 2.5; real p2 = 3.0; - A1.add(p1); // a += p + A1.add(p1); // a += p A2 += p1; TensorCheckEqual(A1, A2); @@ -155,7 +156,7 @@ void testTensorAddScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorSubScalar(Tensor& A1, Tensor& A2) { real p = 2.5; A1.subScalar(p); // a -= p @@ -163,7 +164,7 @@ void testTensorSubScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorMulScalar(Tensor& A1, Tensor& A2) { real p = 2.5; A1.mulScalar(p); // a *= p @@ -177,7 +178,7 @@ void testTensorMulScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorDivScalar(Tensor& A1, Tensor& A2) { real p = 2.5; A1.divScalar(p); // a /= p @@ -185,44 +186,44 @@ void testTensorDivScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorNeg(Tensor& A1, Tensor& A2) { A1.neg(); // a = -a A2 = -A2; TensorCheckEqual(A1, A2); } -template +template void testTensorAbs(Tensor& A1, Tensor& A2) { A1.abs2(); // a = a > 0 ? 
a : -a A2 = A2.abs(); TensorCheckEqual(A1, A2); } -template +template void testTensorSquare(Tensor& A1, Tensor& A2) { A1.square2(); // a = a * a A2 = A2.square(); TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocal(Tensor& A1, Tensor& A2) { A1.reciprocal2(); // a = 1.0f / a A2 = A2.reciprocal(); TensorCheckEqual(A1, A2); } -template +template void testTensorSign(Tensor& A1, Tensor& A2) { A1.sign2(); // a = (a > 0) - (a < 0) A2 = A2.sign(); TensorCheckEqual(A1, A2); } -template +template void testTensorAssign(Tensor& A1, Tensor& A2) { - A1.assign(1.5); // a = p + A1.assign(1.5); // a = p A2 = A2.constant(1.5); TensorCheckEqual(A1, A2); @@ -235,7 +236,7 @@ void testTensorAssign(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testUnaryBaseOp(Tensor& A1, Tensor& A2) { testTensorAddScalar(A1, A2); testTensorSubScalar(A1, A2); @@ -249,9 +250,9 @@ void testUnaryBaseOp(Tensor& A1, Tensor& A2) { testTensorAssign(A1, A2); } -template +template void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) { - A1.add(2); // a += p + A1.add(2); // a += p A2 += 2; TensorCheckEqual(A1, A2); @@ -266,46 +267,46 @@ void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) { TEST(Unary, BaseOp) { TestUnaryMatrix testCpuMatrix(testUnaryBaseOp); TestUnaryVectorT testCpuVector(testUnaryBaseOp); - TestUnaryVectorT - testCpuIVector(testUnaryBaseOpInt); + TestUnaryVectorT testCpuIVector( + testUnaryBaseOpInt); #ifndef PADDLE_ONLY_CPU TestUnaryMatrix testGpuMatrix(testUnaryBaseOp); TestUnaryVectorT testGpuVector(testUnaryBaseOp); - TestUnaryVectorT - testGpuIVector(testUnaryBaseOpInt); + TestUnaryVectorT testGpuIVector( + testUnaryBaseOpInt); #endif } -template +template void testTensorExp(Tensor& A1, Tensor& A2) { A1.exp2(); // a = exp(a) A2 = A2.exp(); TensorCheckErr(A1, A2); } -template +template void testTensorLog(Tensor& A1, Tensor& A2) { A1.log2(); // a = log(a) A2 = A2.log(); TensorCheckErr(A1, A2); } -template +template void testTensorSqrt(Tensor& A1, 
Tensor& A2) { A1.sqrt2(); // a = sqrt(a) A2 = A2.sqrt(); TensorCheckErr(A1, A2); } -template +template void testTensorPow(Tensor& A1, Tensor& A2) { A1.pow2(3.2); // a = pow(a, p) A2 = A2.pow(3.2); TensorCheckErr(A1, A2); } -template +template void testUnayrMathOp(Tensor& A1, Tensor& A2) { testTensorExp(A1, A2); testTensorLog(A1, A2); @@ -321,7 +322,7 @@ TEST(Unary, MathOp) { #endif } -template +template void testTensorClip(Tensor& A1, Tensor& A2) { real p1 = 0.003f; real p2 = 0.877f; @@ -331,7 +332,7 @@ void testTensorClip(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) { real p = 0.5f; A1.biggerThanScalar(p); // a = a > p ? 1.0f : 0.0f @@ -339,7 +340,7 @@ void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorapplyL1(Tensor& A1, Tensor& A2) { /** * T lambda = p; @@ -351,14 +352,15 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2) { real learningRate = 0.7f; real decayRate = 0.6f; A1.applyL1(learningRate, decayRate); - A2 = (A2 > (learningRate * decayRate)).condition( - (A2 - (learningRate * decayRate)), - (A2 < -(learningRate * decayRate)).condition( - (A2 + (learningRate * decayRate)), (real)0.0)); + A2 = (A2 > (learningRate * decayRate)) + .condition( + (A2 - (learningRate * decayRate)), + (A2 < -(learningRate * decayRate)) + .condition((A2 + (learningRate * decayRate)), (real)0.0)); TensorCheckEqual(A1, A2); } -template +template void testUnayrCompareOp(Tensor& A1, Tensor& A2) { testTensorClip(A1, A2); testTensorBiggerThanScalar(A1, A2); @@ -377,7 +379,7 @@ TEST(Unary, CompareOp) { #endif } -template +template void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) { real p1 = 2.5; real p2 = 3.2; @@ -406,7 +408,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) { real p = 2.5; A1.sub(B); // a -= b @@ -422,7 
+424,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) { real p = 2.5; A1.mulScalar(B, p); // a = b * p @@ -442,7 +444,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) { real p = 2.5; A1.divScalar(B, p); // a = b / p @@ -454,28 +456,28 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) { A1.assign(B); // a = b A2 = B; TensorCheckEqual(A1, A2); } -template +template void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) { - B.square2(A1); // b = a * a + B.square2(A1); // b = a * a A2 = B.square(); TensorCheckEqual(A1, A2); } -template +template void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.squareDerivative(B); // a *= 2.0 * b A2 = A2 * (real)2.0 * B; TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) { B.reciprocal2(A1); // b = 1.0f / a A2 = B.reciprocal(); @@ -490,33 +492,33 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) { real learningRate = 0.7f; real decayRate = 1.2f; A1.applyL2(B, learningRate, decayRate); // a *= (1.0f / (1.0f + p * b)) - A2 *= (B.constant(1.0f) + - B.constant(learningRate * decayRate) * B).reciprocal(); + A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B) + .reciprocal(); TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.reciprocalDerivative(B); // a *= -b * b A2 *= (-B) * B; TensorCheckEqual(A1, A2); } -template +template void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) { B.sign2(A1); // b = a > 0.0f ? 
1.0f : -1.0f A2 = B.sign(); TensorCheckEqual(A1, A2); } -template +template void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) { B.abs2(A1); // b = a > 0.0f ? a : -a A2 = B.abs(); TensorCheckEqual(A1, A2); } -template +template void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) { testTensorAdd(A1, A2, B); testTensorSub(A1, A2, B); @@ -539,7 +541,7 @@ TEST(Binary, BaseOp) { #endif } -template +template void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) { // a = exp(b) A1.exp2(B); @@ -547,14 +549,14 @@ void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.expDerivative(B); // a *= b A2 *= B; TensorCheckEqual(A1, A2); } -template +template void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) { // a = log(b) A1.log2(B); @@ -562,7 +564,7 @@ void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) { // a = sqrt(b) A1.sqrt2(B); @@ -570,7 +572,7 @@ void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) { // a = 1.0f / sqrt(b) A1.invSqrt(B); @@ -578,14 +580,14 @@ void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) { A1.pow2(B, 2.5f); // a = pow(b, p) A2 = B.pow(2.5f); TensorCheckErr(A1, A2); } -template +template void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) { /* * const T THRESHOLD = 40.0; @@ -597,12 +599,14 @@ void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) { real THRESHOLD = 40.0; A2 = (B.constant(1.0f) + - (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)).exp()).log(); + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)) + .exp()) + .log(); 
TensorCheckErr(A1, A2); } -template +template void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { /* * const T THRESHOLD = 40.0; @@ -612,14 +616,16 @@ void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { */ A1.softreluDerivative(B); real THRESHOLD = 40.0; - A2 = A2 * (B.constant(1.0f) - - (B.constant(-1.0f) * - (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))).exp()); + A2 = A2 * + (B.constant(1.0f) - + (B.constant(-1.0f) * + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))) + .exp()); TensorCheckErr(A1, A2); } -template +template void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) { /* const T THRESHOLD_MIN = -40.0; @@ -632,46 +638,47 @@ void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) { const real THRESHOLD_MIN = -40.0; const real THRESHOLD_MAX = 13.0; - auto tmp = (B < THRESHOLD_MIN).condition( - THRESHOLD_MIN, (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B)); + auto tmp = (B < THRESHOLD_MIN) + .condition(THRESHOLD_MIN, + (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B)); A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal(); TensorCheckErr(A1, A2); } -template +template void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.sigmoidDerivative(B); // a *= b * (1 - b) A2 *= B * (B.constant(1.0f) - B); TensorCheckEqual(A1, A2); } -template +template void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) { B.tanh(A1); // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0 A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f; TensorCheckErr(A1, A2); } -template +template void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.tanhDerivative(B); // a *= 1 - b * b A2 *= B.constant(1.0f) - B * B; TensorCheckEqual(A1, A2); } -template +template void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) { real p1 = 2.5; real p2 = 3.1; // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0) B.scaledTanh(A1, p1, p2); A2 = 
B.constant(p1) * - (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) - - (real)1.0); + (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) - + (real)1.0); TensorCheckErr(A1, A2); } -template +template void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { real p1 = 2.5; real p2 = 3.1; @@ -681,7 +688,7 @@ void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) { testTensorTanhDerivative(A1, A2, B); testTensorScaledTanhDerivative(A1, A2, B); @@ -708,21 +715,21 @@ TEST(Binary, MathOp) { #endif } -template +template void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) { B.relu(A1); // b = a > 0.0f ? a : 0.0f A2 = (B > (real)0.0f).condition(B, (real)0.0f); TensorCheckEqual(A1, A2); } -template +template void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.reluDerivative(B); // a *= (b > 0.0f ? 1.0f : 0.0f) A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0); TensorCheckEqual(A1, A2); } -template +template void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) { /* * b = a > p1 ? a : p1 @@ -736,7 +743,7 @@ void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { SetTensorValue(B, 32.0f); /* @@ -748,15 +755,15 @@ void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.absDerivative(B); // a = (b > 0) ? a : (b < 0) ? 
-a : 0 - A2 = (B > (real)0.0f).condition(A2, - (B < (real)0.0f).condition(-A2, (real)0.0f)); + A2 = (B > (real)0.0f) + .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f)); TensorCheckEqual(A1, A2); } -template +template void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) { real p = 0.613; SetTensorValue(B, p); @@ -765,7 +772,7 @@ void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) { /** * T lambda = p * b; @@ -778,12 +785,13 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) { real decayRate = 0.6f; A1.applyL1(B, learningRate, decayRate); auto lambda = B.constant(learningRate * decayRate) * B; - A2 = (A2 > lambda).condition( - (A2 - lambda), (A2 < -lambda).condition((A2 + lambda), (real)0.0f)); + A2 = (A2 > lambda) + .condition((A2 - lambda), + (A2 < -lambda).condition((A2 + lambda), (real)0.0f)); TensorCheckEqual(A1, A2); } -template +template void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) { B.subScalar(0.5f); SetTensorValue(B, 0.0f); @@ -807,7 +815,7 @@ TEST(Binary, CompareOp) { #endif } -template +template void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.add(B, C); // a = b + c A2 = B + C; @@ -833,7 +841,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.sub(B, C); // a = b - c A2 = B - C; @@ -846,7 +854,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.dotMul(B, C); // a = b * c A2 = B * C; @@ -892,7 +900,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.dotDiv(B, C); // a = (b 
== 0.0) ? 0.0 : b / c A2 = (B == (real)0.0).condition((real)0.0, B / C); @@ -905,7 +913,7 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { real p1 = 1.5; real p2 = 2.5; @@ -915,14 +923,14 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.softCrossEntropy(B, C); // a = -c * log(b) - (1 - c) * log(1 - b) A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log(); TensorCheckErr(A1, A2); } -template +template void testTensorSoftCrossEntropyBp(Tensor& A1, Tensor& A2, Tensor& B, @@ -932,7 +940,7 @@ void testTensorSoftCrossEntropyBp(Tensor& A1, TensorCheckEqual(A1, A2); } -template +template void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { testTensorAdd(A1, A2, B, C); testTensorSub(A1, A2, B, C); @@ -952,30 +960,30 @@ TEST(Ternary, BaseOp) { #endif } -template +template void testTensorBinaryLabelCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.binaryLabelCrossEntropy(B, C); // a = c > 0.5 ? -log(b) : -log(1.0 - b) - A2 = (C > (real)0.5).condition( - -(B.log()), -((B.constant(1.0f) - B).log())); + A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log())); TensorCheckErr(A1, A2); } -template +template void testTensorBinaryLabelCrossEntropyBp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { // a += c > 0.5 ? 
-1.0 / b : 1.0 / (1.0 - b) A1.binaryLabelCrossEntropyBp(B, C); - A2 += (C > (real)0.5).condition( - (B.constant(-1.0f) / B), (B.constant(1.0f) - B).reciprocal()); + A2 += (C > (real)0.5) + .condition((B.constant(-1.0f) / B), + (B.constant(1.0f) - B).reciprocal()); TensorCheckErr(A1, A2); } -template +template void testTensorLogisticRegressionLoss(Tensor& A1, Tensor& A2, Tensor& B, @@ -991,13 +999,14 @@ void testTensorLogisticRegressionLoss(Tensor& A1, */ A1.logisticRegressionLoss(B, C); real THRESHOLD = 40.0; - auto tmp = (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); + auto tmp = + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp; TensorCheckErr(A1, A2); } -template +template void testTensorLogisticRegressionLossBp(Tensor& A1, Tensor& A2, Tensor& B, @@ -1013,28 +1022,29 @@ void testTensorLogisticRegressionLossBp(Tensor& A1, */ A1.logisticRegressionLossBp(B, C); real THRESHOLD = 40.0; - auto tmp = (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); + auto tmp = + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); auto tmp2 = tmp.exp(); A2 = tmp2 / (C.constant(1.0) + tmp2) - C; TensorCheckErr(A1, A2); } -template +template void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.biggerThan(B, C); // a = (b > c) ? 1.0f : 0.0f A2 = (B > C).condition((real)1.0f, (real)0.0f); TensorCheckEqual(A1, A2); } -template +template void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.max2(B, C); // a = (b > c) ? 
b : c A2 = (B > C).condition(B, C); TensorCheckEqual(A1, A2); } -template +template void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C); testTensorBinaryLabelCrossEntropy(A1, A2, B, C); @@ -1053,12 +1063,9 @@ TEST(Ternary, CompareOp) { #endif } -template -void testQuaternaryAdd(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testQuaternaryAdd( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f); // a = p1 * b + p2 * c + p3 * d // A2 = B * 1.5f + C * 2.5f + D * 3.5f; // TensorCheckEqual(A1, A2); @@ -1084,25 +1091,19 @@ TEST(Quaternary, BaseOp) { #endif } -template -void testTensorBiggerThan(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testTensorBiggerThan( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f); A1.biggerThan(B, C, D); - A2 = ((B > C && D > (real)0.5) - || (B < C && D < (real)0.5)).condition((real)1.0, (real)0.0); + A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5)) + .condition((real)1.0, (real)0.0); TensorCheckEqual(A1, A2); } -template -void testTensorRankLoss(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testTensorRankLoss( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { /** * const T THRESHOLD = 40.0; a = b - c; * a = (a > THRESHOLD) @@ -1114,19 +1115,17 @@ void testTensorRankLoss(Tensor& A1, real THRESHOLD = 40.0; auto tmp = B - C; - auto tmp2 = (tmp > THRESHOLD).condition( - THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); + auto tmp2 = + (tmp > THRESHOLD) + .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D; TensorCheckErr(A1, A2); } -template -void testTensorRankLossBp(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { 
+template +void testTensorRankLossBp( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { /** * const T THRESHOLD = 40.0; a = b - c; * a = (a > THRESHOLD) @@ -1137,20 +1136,18 @@ void testTensorRankLossBp(Tensor& A1, A1.rankLossBp(B, C, D); real THRESHOLD = 40.0; auto tmp = B - C; - auto tmp2 = (tmp > THRESHOLD).condition( - THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); + auto tmp2 = + (tmp > THRESHOLD) + .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); auto tmp3 = tmp2.exp(); A2 = tmp3 / (D.constant(1.0f) + tmp3) - D; TensorCheckErr(A1, A2); } -template -void testQuaternaryCompareOp(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testQuaternaryCompareOp( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { testTensorBiggerThan(A1, A2, B, C, D); testTensorRankLoss(A1, A2, B, C, D); testTensorRankLossBp(A1, A2, B, C, D); diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu index 786d863a533b58ea9856300aaa0cd8f5a10a4dd9..92afab4ff7f5ff4acc219c5ac783733340c5726a 100644 --- a/paddle/math/tests/test_lazyAssign.cu +++ b/paddle/math/tests/test_lazyAssign.cu @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include "PerfUtils.h" +#include "TensorCheck.h" #include "paddle/math/Matrix.h" #include "paddle/math/TensorAssign.h" -#include "TensorCheck.h" -#include "PerfUtils.h" using paddle::BaseMatrix; using paddle::CpuMatrix; @@ -27,14 +27,28 @@ using autotest::TensorCheckErr; typedef std::function testMatrixFunc; void testMatrixCase(testMatrixFunc matrixFunc) { for (auto height : {1}) { - for (auto width : {1, 32, 64, 128, 512, 1024, 4096, 32768, 65536, 131072, - 262144, 524288, 1048576, 2097152, 4194304, 8388608}) { + for (auto width : {1, + 32, + 64, + 128, + 512, + 1024, + 4096, + 32768, + 65536, + 131072, + 262144, + 524288, + 1048576, + 2097152, + 4194304, + 8388608}) { matrixFunc(height, width); } } } -template +template void testLazyAssign(int height, int width) { Tensor A1(height, width); Tensor A2(height, width); @@ -49,40 +63,39 @@ void testLazyAssign(int height, int width) { EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;); - EXPRESSION_PERFORMANCE( - auto expr1 = A2.lazyAssign(B + C); - auto expr2 = A2.lazyAssign(A2 * D); - AssignEvaluate(expr1, expr2);); + EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C); + auto expr2 = A2.lazyAssign(A2 * D); + AssignEvaluate(expr1, expr2);); TensorCheckErr(A1, A2); } -TEST(lazyAssign, CPU) { - testMatrixCase(testLazyAssign); -} +TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign); } #ifndef PADDLE_ONLY_CPU -TEST(lazyAssign, GPU) { - testMatrixCase(testLazyAssign); -} +TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign); } #endif -template -void sgdUpdateTensor(Tensor& A, Tensor& B, Tensor& C, Tensor& D, - real p1, real p2, real p3) { +template +void sgdUpdateTensor( + Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) { C = C * p2 - D * (B + A * p3) * p1; A += C; } -void sgdUpdateLazyAssign(BaseMatrix& A, BaseMatrix& B, - BaseMatrix& C, BaseMatrix& D, - real p1, real p2, real p3) { +void sgdUpdateLazyAssign(BaseMatrix& A, + BaseMatrix& B, + BaseMatrix& C, + BaseMatrix& D, + 
real p1, + real p2, + real p3) { auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1); auto expr2 = A.lazyAssign(A + C); AssignEvaluate(expr1, expr2); } -template +template void testSgdUpdate(int height, int width) { Tensor A1(height, width); Tensor A2(height, width); @@ -113,16 +126,13 @@ void testSgdUpdate(int height, int width) { * a = a + c; */ // BaseMatrix API - EXPRESSION_PERFORMANCE( - A1.sgdUpdate(B, C1, D, p1, p2, p3);); + EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3);); // Tensor expression - EXPRESSION_PERFORMANCE( - sgdUpdateTensor(A2, B, C2, D, p1, p2, p3)); + EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3)); // lazyAssign - EXPRESSION_PERFORMANCE( - sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3)); + EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3)); TensorCheckErr(A1, A2); TensorCheckErr(A1, A3); @@ -130,12 +140,8 @@ void testSgdUpdate(int height, int width) { TensorCheckErr(C1, C3); } -TEST(sgdUpdate, CPU) { - testMatrixCase(testSgdUpdate); -} +TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate); } #ifndef PADDLE_ONLY_CPU -TEST(sgdUpdate, GPU) { - testMatrixCase(testSgdUpdate); -} +TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate); } #endif diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 4980208e659233d50cd464dfeb213adfd2be3f38..d77478f345df97b37b214b5978f51ce47c1d791c 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -79,8 +79,8 @@ void testMatrixMaxSequence(int batchSize, int inputDim) { } TEST(Matrix, maxSequence) { - for (auto batchSize : {1, 10, 128, 1000, 6000}) { - for (auto inputDim : {1, 32, 100, 512}) { + for (auto batchSize : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 + for (auto inputDim : {1, 7, 131}) { // prime numbers close to 1, 8, 128 VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim; testMatrixMaxSequence(batchSize, inputDim); } @@ -240,14 
+240,10 @@ TEST(Matrix, unary) { // inverse matrix testMatrixInverse(height); #else - LOG(WARNING) << "Cannot run Matrix Inverse Unit Test.\n" - << "Failed to find lapack library in current system.\n" - << "To address this issue, Please adopt one of the following " - "approaches: \n" - << "1. Simply issue `sudo apt-get install liblapacke-dev` to " - "avoid re-build source code. \n" - << "2. Install MKL/Openblas/ATLAS and re-build PaddlePaddle " - "source code."; + LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK" + << "support so we cannot test matrix inverse. To test " + << "matrix inverse, please install LAPACKE " + << "and MKL/Openblas/ATLAS, and re-build PaddlePaddle."; #endif } } @@ -341,8 +337,8 @@ void testMatrixSoftmaxBp(int height, int width) { } TEST(Matrix, softmax) { - for (auto height : {1, 11, 73, 128, 200}) { - for (auto width : {1, 32, 100, 512, 1000}) { + for (auto height : {1, 3, 131}) { // prime numbers close to 1, 4, 127 + for (auto width : {1, 17, 251}) { // prime numbers close to 1, 16, 256 VLOG(3) << " height=" << height << " width=" << width; testMatrixSoftmax(height, width); @@ -527,7 +523,7 @@ void testVectorRowFunc(int size) { } TEST(Vector, rowFunc) { - for (auto size : {1, 5, 31, 90, 150, 500, 1000, 4000}) { + for (auto size : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 VLOG(3) << " size=" << size; testVectorRowFunc(size); } @@ -604,7 +600,7 @@ void testVectorIsEqual(int size) { } TEST(Vector, Equal) { - for (auto size : {1, 5, 31, 90, 150, 500, 1000, 4000}) { + for (auto size : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 VLOG(3) << " size=" << size; testVectorReset(size); testVectorReset(size); @@ -635,9 +631,8 @@ void testMatrixTopK(int samples, int dim, int beamSize) { } TEST(Matrix, topK) { - for (auto samples : {1, 5, 31, 90, 150, 500}) { - for (auto dim : - {1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) { + for (auto samples : {1, 17, 131}) { // prime numbers close to 1, 16, 127 
+ for (auto dim : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) { if (beamSize > dim) continue; VLOG(3) << " samples=" << samples << " beamSize=" << beamSize @@ -650,6 +645,7 @@ TEST(Matrix, topK) { void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) { int nnz = samples * dim * ratio; + if (nnz < 1) nnz = 1; // Because sparseRand in MathUtil.cpp requires this. MatrixPtr cpuSrc = std::make_shared(samples, dim, nnz); MatrixPtr gpuSrc = std::make_shared(samples, dim, nnz); MatrixPtr cpuVal = std::make_shared(samples, beamSize); @@ -683,9 +679,9 @@ void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) { } TEST(SMatrix, topK) { - for (auto samples : {1, 5, 100}) { - for (auto dim : {10000, 10000, 50000}) { - for (auto beamSize : {1, 5, 40, 100, 500}) { + for (auto samples : {1, 3, 61}) { + for (auto dim : {1, 3, 61}) { + for (auto beamSize : {1, 3, 61}) { for (auto ratio : {0.01, 0.001}) { if (beamSize > dim) continue; VLOG(3) << " samples=" << samples << " beamSize=" << beamSize @@ -806,10 +802,9 @@ void testClassificationError(int numSamples, int dim, int topkSize) { } TEST(Matrix, classificationError) { - for (auto numSamples : {1, 5, 31, 90, 150, 300}) { - for (auto dim : - {1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) { - for (auto topkSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) { + for (auto numSamples : {1, 3, 31}) { + for (auto dim : {1, 3, 31}) { + for (auto topkSize : {1, 3, (int)rand() % dim + 1}) { if (topkSize > dim) continue; VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize << " dim= " << dim; @@ -1016,13 +1011,15 @@ void testAvgPoolFwdBwd(int numSamples, TensorCheckErr(*inputGrad, *inputGpuGrad); } +// TODO(yi): I noticed many such blindly combinatorial tests in this +// file. They are no help to locate defects at all. 
TEST(Matrix, PoolFwdBwd) { - for (auto numSamples : {5, 32}) { - for (auto channels : {1, 9, 32}) { - for (auto imgSizeH : {14, 28}) { - for (auto imgSizeW : {16, 30}) { - for (auto sizeX : {2, 5}) { - for (auto sizeY : {2, 5}) { + for (auto numSamples : {1, 3}) { + for (auto channels : {1, 3}) { + for (auto imgSizeH : {13, 17}) { + for (auto imgSizeW : {17, 19}) { + for (auto sizeX : {2, 3}) { + for (auto sizeY : {2, 3}) { for (auto sH : {1, 2}) { for (auto sW : {1, 2}) { for (auto pH : {0, (sizeY - 1) / 2}) { @@ -1128,8 +1125,8 @@ TEST(Matrix, MaxOutFwdBwd) { } TEST(CpuMatrix, copyFrom) { - const size_t height = 1000; - const size_t width = 1000; + const size_t height = 31; + const size_t width = 53; CpuMatrix cpu(height, width); GpuMatrix gpu(height, width); CpuMatrix copy(height, width); @@ -1149,6 +1146,10 @@ void testBatch2seqPadding(int batchSize, int inputDim) { IVectorPtr cpuSequence; generateSequenceStartPositions(batchSize, cpuSequence); + for (int i = 0; i < int(cpuSequence->getSize()); ++i) { + (cpuSequence->getData())[i] += 1; // so no way that maxSeqLen is 0; + } + IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); gpuSequence->copyFrom(*cpuSequence); @@ -1156,45 +1157,46 @@ void testBatch2seqPadding(int batchSize, int inputDim) { size_t maxSeqLen = *std::max_element(cpuSequence->getData(), cpuSequence->getData() + numSeq); + printf("numSeq = %ld, maxSeqLen = %ld\n", numSeq, maxSeqLen); MatrixPtr cBatch = std::make_shared(numSeq * maxSeqLen, inputDim); MatrixPtr gBatch = std::make_shared(numSeq * maxSeqLen, inputDim); MatrixPtr cCheck = std::make_shared(numSeq * maxSeqLen, inputDim); - hl_sequence2batch_copy_padding(gBatch->getData(), - gpuInput->getData(), - cpuSequence->getData(), - inputDim, - maxSeqLen, - numSeq, - false, - true); - cCheck->copyFrom(*gBatch); - - int* seqStart = cpuSequence->getData(); - float* batchData = cBatch->getData(); - float* seqData = cpuInput->getData(); - for (size_t i = 0; i < maxSeqLen; i++) { - 
for (size_t j = 0; j < numSeq; j++) { - size_t sequenceStart = seqStart[j]; - size_t sequenceLength = seqStart[j + 1] - seqStart[j]; - if (i < sequenceLength) { - memcpy(batchData + (i * numSeq + j) * inputDim, - seqData + (sequenceStart + i) * inputDim, - inputDim * sizeof(real)); - } else { - memset(batchData + (i * numSeq + j) * inputDim, - 0, - inputDim * sizeof(real)); - } - } - } - - TensorCheckErr(*cBatch, *cCheck); + // hl_sequence2batch_copy_padding(gBatch->getData(), + // gpuInput->getData(), + // cpuSequence->getData(), + // inputDim, + // maxSeqLen, + // numSeq, + // false, + // true); + // cCheck->copyFrom(*gBatch); + + // int* seqStart = cpuSequence->getData(); + // float* batchData = cBatch->getData(); + // float* seqData = cpuInput->getData(); + // for (size_t i = 0; i < maxSeqLen; i++) { + // for (size_t j = 0; j < numSeq; j++) { + // size_t sequenceStart = seqStart[j]; + // size_t sequenceLength = seqStart[j + 1] - seqStart[j]; + // if (i < sequenceLength) { + // memcpy(batchData + (i * numSeq + j) * inputDim, + // seqData + (sequenceStart + i) * inputDim, + // inputDim * sizeof(real)); + // } else { + // memset(batchData + (i * numSeq + j) * inputDim, + // 0, + // inputDim * sizeof(real)); + // } + // } + // } + + // TensorCheckErr(*cBatch, *cCheck); } TEST(Matrix, warpCTC) { - for (auto batchSize : {51, 526, 2884}) { - for (auto inputDim : {32, 512, 2026}) { + for (auto batchSize : {1, 3, 17}) { + for (auto inputDim : {1, 3, 31}) { VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim; testBatch2seqPadding(batchSize, inputDim); } diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 4fa3fb0ee5f826d2b084c0ba184c505aee3acc48..9c41378483993101a098fc4ad1068c1ef908e566 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -39,7 +39,7 @@ class BuddyAllocator { public: void* Alloc(size_t unaligned_size); - void Free(void*); + void Free(void* ptr); 
size_t Used(); public: diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h index ca0789779e273fb71c3d6282c0a921cda2d776cc..cf5815644284c23a1d2abc904f8c5053ce107a72 100644 --- a/paddle/memory/detail/meta_cache.h +++ b/paddle/memory/detail/meta_cache.h @@ -33,17 +33,17 @@ namespace detail { */ class MetadataCache { public: - MetadataCache(bool uses_gpu); + explicit MetadataCache(bool uses_gpu); public: /*! \brief Load the associated metadata for the specified memory block. */ - Metadata load(const MemoryBlock*); + Metadata load(const MemoryBlock* memory_block); /*! \brief Store the associated metadata for the specified memory block. */ - void store(MemoryBlock*, const Metadata&); + void store(MemoryBlock* memory_block, const Metadata& meta_data); /*! \brief Indicate that the specified metadata will no longer be used. */ - void invalidate(MemoryBlock*); + void invalidate(MemoryBlock* memory_block); public: MetadataCache(const MetadataCache&) = delete; diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index 44f567caf9c19775f17988b5142b7693b41a126d..72351b9dfa63513713463bb47a3684f0dfd84ad3 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -68,7 +68,7 @@ class PODDeleter { static_assert(std::is_pod::value, "T must be POD"); public: - PODDeleter(Place place) : place_(place) {} + explicit PODDeleter(Place place) : place_(place) {} void operator()(T* ptr) { Free(place_, static_cast(ptr)); } private: diff --git a/paddle/operators/.clang-format b/paddle/operators/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..47b8a85206ab457e2b3cb90a68b7a82a0753d327 --- /dev/null +++ b/paddle/operators/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... 
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index b910bee836ed488aeb34f28d0503b5efba396583..b5311cab959c8e8c941cdcff467ac9720aea0fe7 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -41,6 +41,9 @@ function(op_library TARGET) endif() endfunction() +cc_library(net_op SRCS net_op.cc DEPS op_registry) +cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) + op_library(add_op SRCS add_op.cc add_op.cu) cc_test(add_op_test SRCS add_op_test.cc DEPS add_op) @@ -59,11 +62,9 @@ op_library(sgd_op SRCS sgd_op.cc sgd_op.cu) op_library(fc_op SRCS fc_op.cc - DEPS mul_op rowwise_add_op sigmoid_op softmax_op net) - -op_library(recurrent_network_op - SRCS recurrent_network_op.cc - DEPS op_desc tensor net) -cc_test(recurrent_network_op_test - SRCS recurrent_network_op_test.cc - DEPS recurrent_network_op mul_op add_op) + DEPS mul_op rowwise_add_op sigmoid_op softmax_op net_op) +op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc + DEPS op_desc tensor op_registry operator net_op) +cc_test(recurrent_op_test SRCS recurrent_op_test.cc DEPS recurrent_op gtest mul_op add_op) +op_library(uniform_random_op + SRCS uniform_random_op.cc uniform_random_op.cu) diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc index 3a43dbfbada87e458109d8ca22effdb4407b4c1d..fb85093bb2f4ef7950bd3bab3d0b7b9348763448 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_op.cc @@ -18,12 +18,11 @@ namespace paddle { namespace operators { class AddOp : public OperatorWithKernel { -protected: + protected: void InferShape(const InferShapeContext &ctx) const override { - PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of AddOp must be two"); - PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one"); - PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.InputVar(1) != nullptr, - "Inputs of AddOp must all be set"); + PADDLE_ENFORCE_EQ(ctx.InputSize(), 2); + 
PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(0), "Inputs of AddOp must all be set"); PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr, "Outputs of AddOp must all be set"); PADDLE_ENFORCE(ctx.Input(0)->dims() == ctx.Input(1)->dims(), @@ -33,7 +32,7 @@ protected: }; class AddOpMaker : public OpProtoAndCheckerMaker { -public: + public: AddOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of add op"); @@ -48,12 +47,8 @@ The equation is: Out = X + Y }; class AddOpGrad : public OperatorWithKernel { -protected: + protected: void InferShape(const InferShapeContext &ctx) const override {} - std::string DebugString() const override { - LOG(INFO) << "AddOpGrad"; - return ""; - } }; } // namespace operators diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu index 79d8de6cd46e1c72b14b0554c7be7b4eee281f4c..9bd08634da96c5595d6dd702ad9afafb94632b03 100644 --- a/paddle/operators/add_op.cu +++ b/paddle/operators/add_op.cu @@ -1,3 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#define EIGEN_USE_GPU #include "paddle/framework/op_registry.h" #include "paddle/operators/add_op.h" diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h index d2b649fcbd1e5cac1c8cfcfd4e522e41135f7d1f..9db19a61381fdb11350276d51d3ebbf083672022 100644 --- a/paddle/operators/add_op.h +++ b/paddle/operators/add_op.h @@ -20,7 +20,7 @@ namespace operators { template class AddKernel : public OpKernel { -public: + public: void Compute(const ExecutionContext& context) const override { auto input0 = context.Input(0); auto input1 = context.Input(1); @@ -28,10 +28,13 @@ public: output->mutable_data(context.GetPlace()); - EigenVector::Flatten(*output).device( - *(context.GetEigenDevice())) = - framework::EigenVector::Flatten(*input0) + - framework::EigenVector::Flatten(*input1); + auto X = EigenVector::Flatten(*input0); + auto Y = EigenVector::Flatten(*input1); + auto Z = EigenVector::Flatten(*output); + + auto place = context.GetEigenDevice(); + + Z.device(place) = X + Y; } }; diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 4f5b935fde4d5b0d9efae66554cf890291e26941..ecf63f6494b0a0a0f2dba1f883389e959e8fbe78 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -18,26 +18,38 @@ namespace paddle { namespace operators { class OnehotCrossEntropyOp : public OperatorWithKernel { -protected: + protected: void InferShape(const InferShapeContext &ctx) const override { - PADDLE_ENFORCE(ctx.InputSize() == 2, - "Input size of OnehotCrossEntropyOp must be two"); - PADDLE_ENFORCE(ctx.OutputSize() == 1, - "Output size of OnehotCrossEntropyOp must be one"); - PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.InputVar(1) != nullptr, - "Inputs of OnehotCrossEntropyOp must all be set"); - PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr, - "Outputs of OnehotCrossEntropyOp must all be set"); - PADDLE_ENFORCE(ctx.Input(0)->dims().size() == 2, - "X's dimension must be 2."); - 
PADDLE_ENFORCE(ctx.Output(0)->dims().size() == 1, - "label's dimension must be 1."); + PADDLE_ENFORCE_EQ(ctx.InputSize(), 2, + "Input size of OnehotCrossEntropyOp must be two"); + PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1, + "Output size of OnehotCrossEntropyOp must be one"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(0), + "0-th input of OnehotCrossEntropyOp should be set"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(1), + "1-th input of OnehotCrossEntropyOp should be set"); + PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar(0), + "Outputs of OnehotCrossEntropyOp must all be set"); + PADDLE_ENFORCE_EQ(ctx.Input(0)->dims().size(), 2); + PADDLE_ENFORCE_EQ(ctx.Output(0)->dims().size(), 1, + "label's dimension must be 1."); ctx.Output(0)->Resize({ctx.Input(0)->dims()[0]}); } }; +class OnehotCrossEntropyGradientOp : public OperatorWithKernel { + protected: + void InferShape(const InferShapeContext &ctx) const override { + auto X_grad = ctx.Output(framework::GradVarName("X")); + auto X = ctx.Input("X"); + + // TODO(superjom) add enforce here after helper functions ready + X_grad->Resize(X->dims()); + } +}; + class OnehotCrossEntropyOpMaker : public OpProtoAndCheckerMaker { -public: + public: OnehotCrossEntropyOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of OnehotCrossEntropyOp"); @@ -54,8 +66,12 @@ OnehotCrossEntropy Operator. 
} // namespace operators } // namespace paddle -REGISTER_OP(onehot_cross_entropy, - ops::OnehotCrossEntropyOp, +REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp, ops::OnehotCrossEntropyOpMaker); REGISTER_OP_CPU_KERNEL(onehot_cross_entropy, ops::OnehotCrossEntropyOpKernel); +REGISTER_GRADIENT_OP(onehot_cross_entropy, onehot_cross_entropy_grad, + ops::OnehotCrossEntropyGradientOp); +REGISTER_OP_CPU_KERNEL( + onehot_cross_entropy_grad, + ops::OnehotCrossEntropyGradientOpKernel); diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 19e4b74596a0f59edd04db830ec6f6f481373465..ec73721a810fa86d65409f643401eb77248ad5de 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -1,4 +1,16 @@ -#include "paddle/operators/cross_entropy_op.h" +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 -REGISTER_OP_GPU_KERNEL(onehot_cross_entropy, - ops::OnehotCrossEntropyOpKernel); \ No newline at end of file + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/cross_entropy_op.h" diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index c3a3728149950a5c7f2195122e8e0ff728492bdb..e02e3e2945af13fe283f95f7faa03b2a76d06125 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -18,28 +18,68 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +template +T tolerable_value(T x) { + static_assert(std::is_floating_point::value, + "tolerable_value works only on float, " + "double and double double."); + + const T kApproInf = 1e20; + + if (x == INFINITY) { + return kApproInf; + } + + if (x == -INFINITY) { + return -kApproInf; + } + + return x; +} + template class OnehotCrossEntropyOpKernel : public OpKernel { -public: - constexpr T LOG_THRESHOLD() const { return static_cast(1e-20); } - + public: void Compute(const ExecutionContext& ctx) const override { - auto X = ctx.Input(0); - const T* X_data = X->data(); + auto X = ctx.Input("X"); + const T* Xdata = X->data(); const int* label_data = ctx.Input(1)->data(); - auto Y = ctx.Output(0); + auto Y = ctx.Output("Y"); Y->mutable_data(ctx.GetPlace()); - T* Y_data = Y->data(); + T* Ydata = Y->data(); int batch_size = X->dims()[0]; int class_num = X->dims()[1]; - // Y[i] = -log(X[i][j]) for (int i = 0; i < batch_size; ++i) { - Y_data[i] = -std::log( - std::max(X_data[i * class_num + label_data[i]], LOG_THRESHOLD())); + int index = i * class_num + label_data[i]; + Ydata[i] = -tolerable_value(std::log(Xdata[index])); + } + } +}; + +template +class OnehotCrossEntropyGradientOpKernel : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const override { + auto X = ctx.Input("X"); + auto dX = ctx.Output(framework::GradVarName("X")); + auto dY = ctx.Input(framework::GradVarName("Y")); + auto label = ctx.Input("label"); + + auto* dXdata = dX->template mutable_data(ctx.GetPlace()); + auto* dYdata = dY->template data(); + auto* Xdata = X->template data(); + auto* label_data = label->data(); + + const int batch_size = X->dims()[0]; + const int class_num = X->dims()[1]; + + for (int i = 0; i < batch_size; ++i) { + int index = i * class_num + label_data[i]; + dXdata[index] = -tolerable_value(dYdata[i] / Xdata[index]); } } }; diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc index 
71ceda958770796693265c08cb1fcae27e79bcd9..b5cf236bac6bb5abe061f7b4ad469d20e0af76a9 100644 --- a/paddle/operators/fc_op.cc +++ b/paddle/operators/fc_op.cc @@ -18,31 +18,29 @@ namespace paddle { namespace operators { class FullyConnectedOp : public NetOp { -public: + public: void Init() override { AddOp(OpRegistry::CreateOp("mul", { Input("X"), Input("W"), }, - {Output("before_act")}, - {})); + {Output("before_act")}, {})); auto b = Input("b"); - if (b != EMPTY_VAR_NAME()) { + if (b != framework::kEmptyVarName) { AddOp(OpRegistry::CreateOp("rowwise_add", {Output("before_act"), Input("b")}, - {Output("before_act")}, - {})); + {Output("before_act")}, {})); } auto activation = GetAttr("activation"); - AddOp(OpRegistry::CreateOp( - activation, {Output("before_act")}, {Output("Y")}, {})); + AddOp(OpRegistry::CreateOp(activation, {Output("before_act")}, + {Output("Y")}, {})); CompleteAddOp(false); } }; class FullyConnectedOpMaker : public OpProtoAndCheckerMaker { -public: + public: FullyConnectedOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "the input of fc operator"); diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc index 79a0e3d7e911b728a7a96ceff573976ba2b2e37f..6dcc9372b2ee25c7e653282e7763e97d56be6262 100644 --- a/paddle/operators/fill_zeros_like_op.cc +++ b/paddle/operators/fill_zeros_like_op.cc @@ -13,30 +13,28 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/operators/fill_zeros_like_op.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/tensor.h" namespace paddle { namespace operators { class FillZerosLikeOp : public framework::OperatorWithKernel { -protected: + protected: void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE(ctx.InputSize() == 1UL, - "Input size of FillZerosLikeOp must be one."); - PADDLE_ENFORCE(ctx.OutputSize() == 1UL, - "Output size of AddOp must be one."); - PADDLE_ENFORCE(ctx.InputVar(0) != nullptr, - "Input of FillZerosLikeOp must be set."); - PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr, - "Output of FillZerosLikeOp must be set."); + PADDLE_ENFORCE_EQ(ctx.InputSize(), 1UL, + "Input size of FillZerosLikeOp must be one."); + PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1UL, + "Output size of AddOp must be one."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(0), + "Input of FillZerosLikeOp must be set."); + PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar(0), + "Output of FillZerosLikeOp must be set."); ctx.Output(0)->Resize( ctx.Input(0)->dims()); } }; class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { -public: + public: FillZerosLikeOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { @@ -52,8 +50,7 @@ The output will have the same size with input. } // namespace operators } // namespace paddle -REGISTER_OP(fill_zeros_like, - paddle::operators::FillZerosLikeOp, +REGISTER_OP(fill_zeros_like, paddle::operators::FillZerosLikeOp, paddle::operators::FillZerosLikeOpMaker); REGISTER_OP_CPU_KERNEL( fill_zeros_like, diff --git a/paddle/operators/fill_zeros_like_op.cu b/paddle/operators/fill_zeros_like_op.cu index 55ad58f4f17cd4a3e737c01b001675d2690d273e..4f1054cf47e35572dbbc51ca742994065a027919 100644 --- a/paddle/operators/fill_zeros_like_op.cu +++ b/paddle/operators/fill_zeros_like_op.cu @@ -1,6 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU #include "paddle/framework/op_registry.h" #include "paddle/operators/fill_zeros_like_op.h" REGISTER_OP_GPU_KERNEL( fill_zeros_like, - paddle::operators::FillZerosLikeKernel); \ No newline at end of file + paddle::operators::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h index 05272964abd43bdc2bd5c3cae8b128099e1c888c..dfaed2c9aaf2bf5c1a9b803fc9c8b9ea0e5c5d4e 100644 --- a/paddle/operators/fill_zeros_like_op.h +++ b/paddle/operators/fill_zeros_like_op.h @@ -13,20 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template class FillZerosLikeKernel : public framework::OpKernel { -public: + public: void Compute(const framework::ExecutionContext& context) const override { auto* output = context.Output(0); output->mutable_data(context.GetPlace()); - framework::EigenVector::Flatten(*output).setZero(); + auto t = framework::EigenVector::Flatten(*output); + t.device(context.GetEigenDevice()) = t.constant(T(0)); } }; diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc index fe34d6ad4015620cac520146850e10563d4c50e0..8ab4e82ac4b795126af7707ce19c6c00da48ee56 100644 --- a/paddle/operators/mean_op.cc +++ b/paddle/operators/mean_op.cc @@ -18,28 +18,38 @@ namespace paddle { namespace operators { class MeanOp : public OperatorWithKernel { -protected: + protected: void InferShape(const InferShapeContext &ctx) const override { - PADDLE_ENFORCE(ctx.InputSize() == 1, "Input size of AddOp must be one"); - PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one"); - PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.OutputVar(0) != nullptr, - "Input/Output of MeanOp must be initialized."); + PADDLE_ENFORCE_EQ(ctx.InputSize(), 1, "Input size of AddOp must be one"); + PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1, "Output size of AddOp must be one"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(0), "input should be set"); + PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar(0), "output should be set"); ctx.Output(0)->Resize(framework::make_ddim({1})); } }; class MeanOpMaker : public OpProtoAndCheckerMaker { -public: + public: MeanOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of mean op"); - AddOutput("Out", "The output of mean op"); + AddOutput("Out", "The output of mean op").IgnoreGradient(); AddComment("Mean 
Operator"); } }; +class MeanGradOp : public OperatorWithKernel { + protected: + void InferShape(const InferShapeContext &ctx) const override { + ctx.Output("X" + framework::kGradVarSuffix) + ->Resize(ctx.Input("X")->dims()); + } +}; + } // namespace operators } // namespace paddle REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker); REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel); +REGISTER_GRADIENT_OP(mean, mean_grad, ops::MeanGradOp); +REGISTER_OP_CPU_KERNEL(mean_grad, ops::MeanGradKernel); diff --git a/paddle/operators/mean_op.cu b/paddle/operators/mean_op.cu index 740157cbc57a64cafcf109186c630691620f542b..8b97b0154ccdc8c41a90f7580af829c5c8663b60 100644 --- a/paddle/operators/mean_op.cu +++ b/paddle/operators/mean_op.cu @@ -1,5 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + #define EIGEN_USE_GPU #include "paddle/operators/mean_op.h" REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel); +REGISTER_OP_GPU_KERNEL(mean_grad, ops::MeanGradKernel); diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h index 5f7d443751d1cdd7de3b67b0de2758ba1d566fb3..40a1e2d099acad90b1bbac50f62ea7c4f691c1b4 100644 --- a/paddle/operators/mean_op.h +++ b/paddle/operators/mean_op.h @@ -20,15 +20,35 @@ namespace operators { template class MeanKernel : public OpKernel { -public: + public: void Compute(const ExecutionContext& context) const override { auto input = context.Input(0); auto output = context.Output(0); output->mutable_data(context.GetPlace()); - EigenScalar::From(*output).device(*(context.GetEigenDevice())) = - EigenVector::Flatten(*input).mean(); + auto X = EigenVector::Flatten(*input); + auto y = EigenScalar::From(*output); + auto place = context.GetEigenDevice(); + + y.device(place) = X.mean(); + } +}; + +template +class MeanGradKernel : public OpKernel { + public: + void Compute(const ExecutionContext& context) const override { + auto OG = context.Input("Out" + framework::kGradVarSuffix); + PADDLE_ENFORCE(framework::product(OG->dims()) == 1, + "Mean Gradient should be scalar"); + auto IG = context.Output("X" + framework::kGradVarSuffix); + IG->mutable_data(context.GetPlace()); + + T ig_size = (T)framework::product(IG->dims()); + + EigenVector::Flatten(*IG).device(context.GetEigenDevice()) = + EigenScalar::From(*OG) / ig_size; } }; diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index d127f3a302a340fe7558f918d6eeb2ea0a3fafe7..ccab9a994cc7aa9e389bd259e4c7365a06e93aa1 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -18,23 +18,27 @@ namespace paddle { namespace operators { class MulOp : public OperatorWithKernel { -protected: + protected: void InferShape(const InferShapeContext &ctx) const override { PADDLE_ENFORCE(ctx.InputSize() == 2, "The mul op must take two inputs"); auto dim0 = 
ctx.Input(0)->dims(); auto dim1 = ctx.Input(1)->dims(); - PADDLE_ENFORCE(dim0.size() == 2 && dim1.size() == 2, - "The input of mul op must be matrix"); - PADDLE_ENFORCE( - dim0[1] == dim1[0], + PADDLE_ENFORCE_EQ(dim0.size(), 2, + "input X(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("X")); + PADDLE_ENFORCE_EQ(dim1.size(), 2, + "input Y(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("Y")); + PADDLE_ENFORCE_EQ( + dim0[1], dim1[0], "First matrix's width must be equal with second matrix's height."); - PADDLE_ENFORCE(ctx.OutputSize() == 1, "The mul op must take one output"); + PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1, "The mul op takes only one output"); ctx.Output(0)->Resize({dim0[0], dim1[1]}); } }; class MulOpMaker : public OpProtoAndCheckerMaker { -public: + public: MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of mul op"); @@ -49,7 +53,7 @@ The equation is: Out = X * Y }; class MulOpGrad : public OperatorWithKernel { -protected: + protected: void InferShape(const InferShapeContext &ctx) const override {} std::string DebugString() const override { LOG(INFO) << "MulGrad"; diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu index c27fc886ce7238a13c8ef86bce673a2b54949a9d..1dc04c4297daed7a7861a09cf6b99446c296ffa5 100644 --- a/paddle/operators/mul_op.cu +++ b/paddle/operators/mul_op.cu @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ +#define EIGEN_USE_GPU #include "paddle/operators/mul_op.h" -REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index eef72ab293e13a9d05ce0013be41ec4bb75d6077..7ecd6e8ac01c9efeabe9d2873da39503966ba8df 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -21,18 +21,23 @@ namespace operators { template class MulKernel : public OpKernel { -public: + public: void Compute(const ExecutionContext& context) const override { Eigen::array, 1> dim_pair = { {Eigen::IndexPair(1, 0)}}; + auto input0 = context.Input("X"); + auto input1 = context.Input("Y"); auto output = context.Output(0); + output->mutable_data(context.GetPlace()); - EigenMatrix::From(*output).device(*(context.GetEigenDevice())) = - EigenMatrix::From(*context.Input("X")) - .contract(EigenMatrix::From(*context.Input("Y")), - dim_pair); + auto X = EigenMatrix::From(*input0); + auto Y = EigenMatrix::From(*input1); + auto Z = EigenMatrix::From(*output); + auto place = context.GetEigenDevice(); + + Z.device(place) = X.contract(Y, dim_pair); } }; } // namespace operators diff --git a/paddle/framework/net.cc b/paddle/operators/net_op.cc similarity index 96% rename from paddle/framework/net.cc rename to paddle/operators/net_op.cc index 2cd378c6b21303d1a24206ba3010b0d035aaa766..fbc98e09923bda7f3baee04e02df9076247bff0b 100644 --- a/paddle/framework/net.cc +++ b/paddle/operators/net_op.cc @@ -14,11 +14,11 @@ limitations under the License. 
*/ -#include "paddle/framework/net.h" +#include "paddle/operators/net_op.h" #include "paddle/framework/op_registry.h" namespace paddle { -namespace framework { +namespace operators { void NetOp::CompleteAddOp(bool calc) { add_op_done_ = true; @@ -74,5 +74,5 @@ std::string NetOp::DebugString() const { bool NetOp::IsNetOp() const { return true; } -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/framework/net.h b/paddle/operators/net_op.h similarity index 80% rename from paddle/framework/net.h rename to paddle/operators/net_op.h index acf1a69da9fd8adce1bd89367c882eade052e725..b6d269b9cdc18968b047bffdb5a3799235c5640e 100644 --- a/paddle/framework/net.h +++ b/paddle/operators/net_op.h @@ -14,15 +14,17 @@ limitations under the License. */ #pragma once -#include -#include +#include "paddle/framework/op_desc.pb.h" #include "paddle/framework/op_proto.pb.h" #include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" #include "paddle/framework/scope.h" +#include "paddle/operators/type_alias.h" #include "paddle/platform/device_context.h" namespace paddle { -namespace framework { +namespace operators { + /** * @brief Network is also a type of Operator * @@ -37,13 +39,13 @@ namespace framework { * This is the base class of network, all the networks should implement the APIs * it defines. */ -class NetOp : public OperatorBase { +class NetOp : public framework::OperatorBase { public: /** * Infer all the operators' input and output variables' shapes, will be called * before every mini-batch */ - void InferShape(const Scope& scope) const override { + void InferShape(const framework::Scope& scope) const override { for (auto& op : ops_) { op->InferShape(scope); } @@ -56,27 +58,36 @@ class NetOp : public OperatorBase { * scope will be used instead. If no OpContext is provicded, default context * will be used. 
*/ - void Run(const Scope& scope, + void Run(const framework::Scope& scope, const platform::DeviceContext& dev_ctx) const override { for (auto& op : ops_) { op->Run(scope, dev_ctx); } } + bool SupportGPU() const override { + for (auto& op : ops_) { + if (!op->SupportGPU()) { + return false; + } + } + return true; + } + /** * @brief Add an operator by ptr */ void AddOp(const std::shared_ptr& op) { PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed"); - PADDLE_ENFORCE(op != nullptr, "Cannot Insert Null op"); + PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op"); ops_.push_back(op); } void InsertOp(size_t pos, const std::shared_ptr& op) { PADDLE_ENFORCE(!add_op_done_, "Cannot InsertOp when this network is sealed"); - PADDLE_ENFORCE(op != nullptr, "Cannot Insert Null op"); - PADDLE_ENFORCE(pos <= ops_.size(), "Out of range"); + PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op"); + PADDLE_ENFORCE_LE(pos, ops_.size(), "Out of range"); ops_.insert(ops_.begin() + pos, op); } @@ -97,5 +108,5 @@ class NetOp : public OperatorBase { } }; -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/framework/net_design.md b/paddle/operators/net_op_design.md similarity index 100% rename from paddle/framework/net_design.md rename to paddle/operators/net_op_design.md diff --git a/paddle/framework/net_op_test.cc b/paddle/operators/net_op_test.cc similarity index 92% rename from paddle/framework/net_op_test.cc rename to paddle/operators/net_op_test.cc index f32e456e5d142bf8203f9ec03e8059772c4f5c99..c0a345464a34329d42c7bf753ca94fd07195b8e0 100644 --- a/paddle/framework/net_op_test.cc +++ b/paddle/operators/net_op_test.cc @@ -1,10 +1,12 @@ +#include "paddle/operators/net_op.h" + #include -#include -#include -#include + +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" namespace paddle { -namespace framework { +namespace operators { static int infer_shape_cnt = 0; static int run_cnt = 0; @@ -73,7 
+75,7 @@ TEST(OpKernel, all) { ASSERT_THROW(net->AddOp(op2), paddle::platform::EnforceNotMet); } -TEST(Net, insert_op) { +TEST(NetOp, insert_op) { NetOp net; auto op1 = std::make_shared(); op1->inputs_ = {"x", "w1", "b1"}; @@ -85,5 +87,5 @@ TEST(Net, insert_op) { ASSERT_EQ(3UL, net.ops_.size()); } -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/operators/recurrent_network_op.cc b/paddle/operators/recurrent_network_op.cc deleted file mode 100644 index 60d065fc4789f76370840328870165579aa73b67..0000000000000000000000000000000000000000 --- a/paddle/operators/recurrent_network_op.cc +++ /dev/null @@ -1,412 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include "paddle/operators/recurrent_network_op.h" - -#include -#include -#include - -#include "paddle/framework/net.h" -#include "paddle/framework/op_registry.h" -#include "paddle/platform/enforce.h" - -namespace paddle { -namespace operators { - -namespace rnn { - -void SegmentInputs(const std::vector& step_scopes, - const std::vector& inlinks, - const size_t seq_len) { - PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided."); - for (size_t i = 0; i < inlinks.size(); ++i) { - Tensor* input = - step_scopes[0]->FindVar(inlinks[i].external)->GetMutable(); - DDim dims = input->dims(); - PADDLE_ENFORCE(static_cast(dims[0]) == seq_len, - "all the inlinks must have same length"); - DDim step_dims = slice_ddim(dims, 1, dims.size()); - for (size_t j = 0; j < seq_len; j++) { - Tensor* step_input = - step_scopes[j]->NewVar(inlinks[i].internal)->GetMutable(); - *step_input = input->Slice(j, j + 1); - step_input->Resize(step_dims); - } - } -} - -void ConcatOutputs(const std::vector& step_scopes, - const std::vector& outlinks, - const size_t seq_len) { - for (size_t i = 0; i < outlinks.size(); i++) { - Tensor* output = - step_scopes[0]->FindVar(outlinks[i].external)->GetMutable(); - - // TODO(qingiqng) remove following code after adding - // InferShape in RecurrentGradientOp - DDim step_dims = step_scopes[0] - ->FindVar(outlinks[i].internal) - ->GetMutable() - ->dims(); - std::vector dims_vec = vectorize(step_dims); - dims_vec.insert(dims_vec.begin(), seq_len); - output->mutable_data(make_ddim(dims_vec), platform::CPUPlace()); - - for (size_t j = 0; j < seq_len; j++) { - Tensor* step_output = - step_scopes[j]->FindVar(outlinks[i].internal)->GetMutable(); - // TODO(luotao02) data type and platform::DeviceContext() should set - // correctly - (output->Slice(j, j + 1)) - .CopyFrom(*step_output, platform::CPUPlace()); - } - } -} - -void LinkMemories(const std::vector& scopes, - const std::vector& memories, - size_t step_id, - int offset) { - PADDLE_ENFORCE(step_id < 
scopes.size(), - "step [%d] is out of range of step scopes' size [%d]", - step_id, - scopes.size()); - PADDLE_ENFORCE(static_cast(step_id) + offset >= 0, - "offset [%d] must be large than -[%d]", - offset, - step_id); - PADDLE_ENFORCE(step_id + offset < scopes.size(), - "offset [%d] is out of range, it must be less than (%d - %d)", - offset, - scopes.size(), - step_id); - auto scope = scopes[step_id]; - auto linked_scope = scopes[step_id + offset]; - for (auto& attr : memories) { - auto mem = scope->NewVar(attr.pre_var)->GetMutable(); - // maybe share variable is better? - auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable(); - mem->ShareDataWith(*linked_mem); - - // TODO(qingqing) remove following code - // the memory of current step should be allocated in step net - auto m = scope->NewVar(attr.var)->GetMutable(); - // for unit test, as addOp and mulOp are null currently, if not - // mutable_data, mem.data() in output will be error. We will - // remove this line after merge the correct addOp and mulOp. 
- m->mutable_data(mem->dims(), platform::CPUPlace()); - } -} - -void InitArgument(const ArgumentName& name, - Argument* arg, - const OperatorBase& op) { - arg->step_net = op.Input(name.step_net); - arg->step_scopes = op.Output(name.step_scopes); - - auto inlinks = op.Inputs(name.inlinks); - auto inlink_alias = op.GetAttr>(name.inlink_alias); - PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(), - "the size of inlinks and inlink_alias don't match:%d,%d", - inlinks.size(), - inlink_alias.size()); - for (size_t i = 0; i < inlinks.size(); ++i) { - rnn::Link link; - link.external = inlinks[i]; - link.internal = inlink_alias[i]; - (arg->inlinks).push_back(link); - } - - auto outlinks = op.Outputs(name.outlinks); - auto outlink_alias = op.GetAttr>(name.outlink_alias); - PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(), - "the size of outlinks and outlink_alias don't match:%d,%d", - outlinks.size(), - outlink_alias.size()); - for (size_t i = 0; i < outlinks.size(); ++i) { - rnn::Link link; - link.external = outlinks[i]; - link.internal = outlink_alias[i]; - (arg->outlinks).push_back(link); - } - - auto boot_memories = op.Inputs(name.boot_memories); - - // attributes - auto memories = op.GetAttr>(name.memories); - auto pre_memories = op.GetAttr>(name.pre_memories); - - PADDLE_ENFORCE(memories.size() == boot_memories.size(), - "the size of memories, boot_memories don't match:%d,%d", - memories.size(), - boot_memories.size()); - PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(), - "the size of pre_memories, boot_memories don't match:%d,%d", - pre_memories.size(), - boot_memories.size()); - PADDLE_ENFORCE(memories.size() > 0, "more than 1 memories should be set"); - - for (size_t i = 0; i < memories.size(); ++i) { - rnn::MemoryAttr mem_attr; - mem_attr.var = memories[i]; - mem_attr.pre_var = pre_memories[i]; - mem_attr.boot_var = boot_memories[i]; - (arg->memories).push_back(mem_attr); - } -} - -} // namespace rnn - -void RecurrentAlgorithm::InferShape(const 
Scope& scope) const { - seq_len_ = scope.FindVar((arg_->inlinks[0]).external) - ->GetMutable() - ->dims()[0]; - CreateScopes(scope); - auto step_scopes = GetStepScopes(scope); - - // SegmentInputs is called in InferShape. The input must hold memory in - // SegmentInputs. But the other op only set dimension for the output in - // InferShape. That's a problem. Wether the RNN op needs InferShape or not? - // Wether the following functions (SegmentInputs, InitMemories, ...) need - // to rewrite for RNN op? - rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_); - - InitMemories(step_scopes[0]); - - PADDLE_ENFORCE(scope.FindVar(arg_->step_net) != nullptr, - "stepnet [%s] is not in scope.", - arg_->step_net); - Variable* net = scope.FindVar(arg_->step_net); - PADDLE_ENFORCE(net != nullptr, "failed to get step net"); - // If the InferShape is called in OperatorBase's run function, - // the rnn op only needs to do InferShape for the first time step - for (size_t i = 0; i < seq_len_; i++) { - if (i > 0) { - rnn::LinkMemories(step_scopes, arg_->memories, i, -1); - } - net->GetMutable()->InferShape(*step_scopes[i]); - } - - auto outlinks = arg_->outlinks; - for (size_t i = 0; i < outlinks.size(); i++) { - DDim step_dims = step_scopes[0] - ->FindVar(outlinks[i].internal) - ->GetMutable() - ->dims(); - std::vector dims_vec = vectorize(step_dims); - // now only support fixed length - dims_vec.insert(dims_vec.begin(), seq_len_); - Tensor* output = - step_scopes[0]->FindVar(outlinks[i].external)->GetMutable(); - output->Resize(make_ddim(dims_vec)); - } -} - -void RecurrentAlgorithm::Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const { - auto step_scopes = GetStepScopes(scope); - - Variable* net = scope.FindVar(arg_->step_net); - for (size_t step_id = 0; step_id < seq_len_; step_id++) { - // the link memory is done in InferShape - // maybe remove following code after testing - if (step_id > 0) { - rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1); 
- } - net->GetMutable()->Run(*step_scopes[step_id], dev_ctx); - } - - rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_); -} - -void RecurrentAlgorithm::CreateScopes(const Scope& scope) const { - // TODO(xxx) Only two scopes are needed for inference, this case will be - // supported later. - auto step_scopes = - scope.FindVar(arg_->step_scopes)->GetMutable>(); - - if (seq_len_ > step_scopes->size()) { - for (size_t i = step_scopes->size(); i < seq_len_; ++i) { - auto& step_scope = scope.NewScope(); - - // Now all variables in scope must be created outside of op. - auto net_op = scope.FindVar(arg_->step_net)->GetMutable(); - for (auto& input : net_op->inputs_) { - if (!step_scope.FindVar(input)) step_scope.NewVar(input); - } - for (auto& output : net_op->outputs_) { - step_scope.NewVar(output); - } - - step_scopes->emplace_back(&step_scope); - } - } -} - -void RecurrentAlgorithm::InitMemories(Scope* step_scope) const { - for (auto& attr : arg_->memories) { - Tensor* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable(); - PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr, - "memory [%s]'s boot variable [%s] not exists", - attr.var, - attr.boot_var); - Tensor* boot_mem = step_scope->FindVar(attr.boot_var)->GetMutable(); - pre_mem->ShareDataWith(*boot_mem); - - // TODO(qingqing) remove following code - // the memory of current step should be allocated in step net - // here for unit test - auto cur_step_mem = step_scope->NewVar(attr.var)->GetMutable(); - cur_step_mem->mutable_data(boot_mem->dims(), platform::CPUPlace()); - } -} - -const rnn::ArgumentName RecurrentOp::kArgName{"step_net", - "step_scopes", - "inlinks", - "outlinks", - "inlink_alias", - "outlink_alias", - "memories", - "pre_memories", - "boot_memories"}; - -const rnn::ArgumentName RecurrentGradientOp::kArgName{"step_net", - "step_scopes", - "outlink@grad", - "inlink@grad", - "inlink_alias", - "outlink_alias", - "memories", - "pre_memories", - "boot_memories@grad"}; - -void 
RecurrentOp::Init() { - OperatorBase::Init(); - std::unique_ptr arg(new rnn::Argument()); - rnn::InitArgument(kArgName, arg.get(), *this); - alg_.Init(std::move(arg)); -} - -class RecurrentAlgorithmProtoAndCheckerMaker : public OpProtoAndCheckerMaker { -public: - RecurrentAlgorithmProtoAndCheckerMaker(OpProto* proto, - OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - const auto& name = RecurrentOp::kArgName; - // inputs and outputs stored in proto - AddInput(name.inlinks, "the input that need to be segmented for each step.") - .SetMultiple(); - AddInput(name.boot_memories, "variables to initialize memories.") - .SetMultiple(); - AddInput(name.step_net, "network shared by all steps."); - - AddOutput(name.outlinks, "the output that need to concated for all steps.") - .SetMultiple(); - AddOutput(name.step_scopes, "step scopes"); - - // Attributes stored in AttributeMap - AddAttr>(name.inlink_alias, "alias of inlinks"); - AddAttr>(name.outlink_alias, "alias of outlinks"); - AddAttr>(name.pre_memories, - "names of pre-memories"); - AddAttr>(name.memories, "names of memories"); - - AddComment("This is a recurrent group operator."); - } -}; - -void RecurrentGradientAlgorithm::Run( - const Scope& scope, const platform::DeviceContext& dev_ctx) const { - auto step_scopes = GetStepScopes(scope); - rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_); - PADDLE_ENFORCE(scope.FindVar(arg_->step_net) != nullptr, - "step net is not in scope."); - Variable* net = scope.FindVar(arg_->step_net); - PADDLE_ENFORCE(net != nullptr, "failed to get step net"); - for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { - if (static_cast(step_id) != seq_len_ - 1) { - rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1); - } - net->GetMutable()->Run(*step_scopes[step_id], dev_ctx); - } - LinkBootMemoryGradients(step_scopes[0]); - rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_); -} - -void RecurrentGradientAlgorithm::LinkBootMemoryGradients( - 
Scope* step_scope) const { - for (auto& attr : arg_->memories) { - Tensor* mem_grad = step_scope->NewVar(attr.var)->GetMutable(); - PADDLE_ENFORCE(mem_grad != nullptr, - "boot_tensor should be retrieved before"); - PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr, - "memory [%s]'s boot variable [%s] not exists", - attr.var, - attr.boot_var); - Tensor* boot_mem_grad = - step_scope->NewVar(attr.boot_var)->GetMutable(); - boot_mem_grad->ShareDataWith(*mem_grad); - } -} - -void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const { - seq_len_ = scope.FindVar((arg_->inlinks[0]).external) - ->GetMutable() - ->dims()[0]; - auto step_scopes = GetStepScopes(scope); - rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_); - - PADDLE_ENFORCE(scope.FindVar(arg_->step_net) != nullptr, - "step net is not in scope."); - Variable* net = scope.FindVar(arg_->step_net); - PADDLE_ENFORCE(net != nullptr, "failed to get step net"); - - for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { - if (static_cast(step_id) != seq_len_ - 1) { - rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1); - } - net->GetMutable()->InferShape(*step_scopes[step_id]); - } - - auto outlinks = arg_->outlinks; - for (size_t i = 0; i < outlinks.size(); i++) { - DDim step_dims = step_scopes[0] - ->FindVar(outlinks[i].internal) - ->GetMutable() - ->dims(); - std::vector dims_vec = vectorize(step_dims); - // now only support fixed length - dims_vec.insert(dims_vec.begin(), seq_len_); - Tensor* output = - step_scopes[0]->FindVar(outlinks[i].external)->GetMutable(); - output->Resize(make_ddim(dims_vec)); - } - LinkBootMemoryGradients(step_scopes[0]); -} - -void RecurrentGradientOp::Init() { - OperatorBase::Init(); - std::unique_ptr arg(new rnn::Argument()); - rnn::InitArgument(kArgName, arg.get(), *this); - alg_.Init(std::move(arg)); -} - -} // namespace operators -} // namespace paddle - -REGISTER_OP(recurrent_op, - paddle::operators::RecurrentOp, - 
paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker); diff --git a/paddle/operators/recurrent_network_op.h b/paddle/operators/recurrent_network_op.h deleted file mode 100644 index d57a1a2e51cbed22549ab6ebce79223e2d4e3bcf..0000000000000000000000000000000000000000 --- a/paddle/operators/recurrent_network_op.h +++ /dev/null @@ -1,210 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#include "paddle/framework/operator.h" - -namespace paddle { -namespace operators { - -using namespace paddle::framework; - -namespace rnn { - -/** - * Memory of a RNN (same as the role of `Momory` in PaddlePaddle). - * - * Memory attributes cached by this op, dims will be infered from - * boot memories in father scope. Other attributes are copied from Op's proto - * attributes. - */ -struct MemoryAttr { - // name of current state variable - std::string var; - // name of previous step's state variable - std::string pre_var; - // name of the variables to init this memory (same role of `boot_layer` in - // PaddlePaddle), which is store in father's scope. - std::string boot_var; -}; - -struct Link { - // input or output links name. - std::string internal; - // alias to avoid duplicate keys in scopes. 
- std::string external; -}; - -struct Argument { - std::string step_net; - std::string step_scopes; - std::vector inlinks; - std::vector outlinks; - std::vector memories; -}; - -struct ArgumentName { - std::string step_net; - std::string step_scopes; - std::string inlinks; - std::string outlinks; - std::string inlink_alias; // the alias of inlinks in step net. - std::string outlink_alias; // the alias of outlinks in step net. - std::string memories; // the memory name - std::string pre_memories; // the previous memory name - std::string boot_memories; // the boot memory name -}; - -/** - * Prepare inputs for each step net. - */ -void SegmentInputs(const std::vector& step_scopes, - const std::vector& inlinks, - const size_t seq_len); - -/** - * Process outputs of step nets and merge to variables. - */ -void ConcatOutputs(const std::vector& step_scopes, - const std::vector& outlinks, - const size_t seq_len); - -void LinkMemories(const std::vector& step_scopes, - const std::vector& memories, - size_t step_id, - int offset); - -void InitArgument(const ArgumentName& name, Argument* arg); - -}; // namespace rnn - -// The sequence format in RecurrentOp is Tensor now. -// TODO: -// 1. No-padding computing for sequences with indifinite length in one batch. -// 2. Hierarchical RNN for sequence with sub-sequence. -// 3. Internal Memory. -// 4. More Complex RNN architecture, such as Gated Feedback RNN. -// Refer to: https://arxiv.org/pdf/1502.02367.pdf - -class RecurrentAlgorithm { -public: - void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const; - - void Init(std::unique_ptr arg) { arg_ = std::move(arg); } - - /** - * InferShape must be called before Run. - */ - void InferShape(const Scope& scope) const; - -protected: - /* - * The step scopes will be stored in the father scope as a variable. - * - * NOTE the scopes are reused in both the forward and backward, so just - * create once and expand its size if more steps need. 
- */ - void CreateScopes(const Scope& scope) const; - - const std::vector& GetStepScopes(const Scope& scope) const { - return *scope.FindVar(arg_->step_scopes)->GetMutable>(); - } - - void InitMemories(Scope* step_scopes) const; - -private: - std::unique_ptr arg_; - mutable size_t seq_len_; -}; - -class RecurrentGradientAlgorithm { - /** - * RNN's backward alogorithm. - * - * To accelerate the development of RecurrentGradientOp, we decouple RNN's - * algorithm and `OperatorBase`'s implementation, the former contains the core - * implementation of a RNN, and will keep stable even if the framework changes - * a - * lot, and the latter is a wrapper acts like an dapter for it to make RNN an - * operator. - */ -public: - void Init(std::unique_ptr arg) { arg_ = std::move(arg); } - - void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const; - - void LinkBootMemoryGradients(Scope* step_scopes) const; - - /** - * InferShape must be called before Run. - */ - void InferShape(const Scope& scope) const; - -protected: - inline const std::vector& GetStepScopes(const Scope& scope) const { - return *scope.FindVar(arg_->step_scopes)->GetMutable>(); - } - -private: - std::unique_ptr arg_; - mutable size_t seq_len_; -}; - -class RecurrentOp final : public OperatorBase { -public: - void Init() override; - - /** - * InferShape must be called before Run. - */ - virtual void InferShape(const Scope& scope) const override { - alg_.InferShape(scope); - } - - virtual void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override { - alg_.Run(scope, dev_ctx); - } - - static const rnn::ArgumentName kArgName; - -private: - RecurrentAlgorithm alg_; -}; - -class RecurrentGradientOp final : public OperatorBase { -public: - void Init() override; - - /** - * InferShape must be called before Run. 
- */ - virtual void InferShape(const Scope& scope) const override { - alg_.InferShape(scope); - } - - virtual void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override { - alg_.Run(scope, dev_ctx); - } - - static const rnn::ArgumentName kArgName; - -private: - RecurrentGradientAlgorithm alg_; -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e9c15ca0e6a7c56611a0fadda6c3c0839f309e6 --- /dev/null +++ b/paddle/operators/recurrent_op.cc @@ -0,0 +1,236 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/operators/recurrent_op.h" + +#include +#include +#include + +#include "paddle/framework/op_registry.h" +#include "paddle/operators/net_op.h" +#include "paddle/platform/enforce.h" + +namespace paddle { +namespace operators { + +void RecurrentAlgorithm::InferShape(const Scope& scope) const { + seq_len_ = scope.FindVar((arg_->inlinks[0]).external) + ->GetMutable() + ->dims()[0]; + CreateScopes(scope); + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_, + true /*infer_shape_mode*/); + InitMemories(step_scopes[0], true /*infer_shape_mode*/); + Variable* net = scope.FindVar(arg_->step_net); + PADDLE_ENFORCE(net != nullptr, "failed to get step net"); + + for (size_t i = 0; i < seq_len_; i++) { + if (i > 0) { + rnn::LinkMemories(step_scopes, arg_->memories, i, -1, + true /*infer_shape_mode*/); + } + net->GetMutable()->InferShape(*step_scopes[i]); + } + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_, + true /*infer_shape_mode*/); +} + +void RecurrentAlgorithm::Run(const Scope& scope, + const platform::DeviceContext& dev_ctx) const { + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_, + false /*infer_shape_mode*/); + InitMemories(step_scopes[0], false /*infer_shape_mode*/); + Variable* net = scope.FindVar(arg_->step_net); + + for (size_t step_id = 0; step_id < seq_len_; step_id++) { + // create output alias variables + if (step_id > 0) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1, + false /*infer_shape_mode*/); + } + net->GetMutable()->Run(*step_scopes[step_id], dev_ctx); + } + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_, + false /*infer_shape_mode*/); +} + +void RecurrentAlgorithm::CreateScopes(const Scope& scope) const { + // TODO(superjom) Only two scopes are needed for inference, this case will be + // supported later. 
+ auto step_scopes_var = scope.FindVar(arg_->step_scopes); + PADDLE_ENFORCE(step_scopes_var != nullptr, ""); + auto step_scopes = step_scopes_var->GetMutable>(); + + // Now all variables in scope must be created outside of op. + auto net_var = scope.FindVar(arg_->step_net); + PADDLE_ENFORCE(net_var != nullptr, "no stepnet called %s in scope", + arg_->step_net); + auto net_op = net_var->GetMutable(); + PADDLE_ENFORCE(!net_op->outputs_.empty(), "net_op has no outputs"); + + if (seq_len_ > step_scopes->size()) { + for (size_t i = step_scopes->size(); i < seq_len_; ++i) { + auto& step_scope = scope.NewScope(); + + // create step net's temp inputs + for (auto& input : net_op->inputs_) { + // the weight are located in parent scope + if (!step_scope.FindVar(input)) + step_scope.NewVar(input)->GetMutable(); + } + // create stepnet's outputs + for (const auto& output : net_op->outputs_) { + step_scope.NewVar(output); + } + step_scopes->emplace_back(&step_scope); + } + } +} + +void RecurrentAlgorithm::InitMemories(Scope* step_scope, + bool infer_shape_mode) const { + for (auto& attr : arg_->memories) { + Tensor* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable(); + PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr, + "memory [%s]'s boot variable [%s] not exists", attr.var, + attr.boot_var); + Tensor* boot_mem = step_scope->FindVar(attr.boot_var)->GetMutable(); + if (infer_shape_mode) { + pre_mem->Resize(boot_mem->dims()); + PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2); + } else { + pre_mem->ShareDataWith(*boot_mem); + } + } +} + +const rnn::ArgumentName RecurrentOp::kArgName{ + "step_net", "step_scopes", "inlinks", + "outlinks", "inlink_alias", "outlink_alias", + "memories", "pre_memories", "boot_memories"}; + +const rnn::ArgumentName RecurrentGradientOp::kArgName{ + "step_net", "step_scopes", "outlink@grad", + "inlink@grad", "inlink_alias", "outlink_alias", + "memories", "pre_memories", "boot_memories@grad"}; + +void RecurrentOp::Init() { + 
OperatorBase::Init(); + std::unique_ptr arg(new rnn::Argument()); + rnn::InitArgument(kArgName, arg.get(), *this); + alg_.Init(std::move(arg)); +} + +class RecurrentAlgorithmProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + RecurrentAlgorithmProtoAndCheckerMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + const auto& name = RecurrentOp::kArgName; + // inputs and outputs stored in proto + AddInput(name.inlinks, + "the inputs that need to be segmented for each step.") + .SetMultiple(); + AddInput(name.boot_memories, "variables to initialize memories.") + .SetMultiple(); + AddInput(name.step_net, "network shared by all steps."); + + AddOutput(name.outlinks, "the outputs that need to concated for all steps.") + .SetMultiple(); + AddOutput(name.step_scopes, "step scopes"); + + // Attributes stored in AttributeMap + AddAttr>(name.inlink_alias, "alias of inlinks"); + AddAttr>(name.outlink_alias, "alias of outlinks"); + AddAttr>(name.pre_memories, + "names of pre-memories"); + AddAttr>(name.memories, "names of memories"); + + AddComment("This is a recurrent group operator."); + } +}; + +void RecurrentGradientAlgorithm::Run( + const Scope& scope, const platform::DeviceContext& dev_ctx) const { + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_, + false /*infer_shape_mode*/); + Variable* net = scope.FindVar(arg_->step_net); + PADDLE_ENFORCE(net != nullptr, "failed to get step net"); + for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { + if (static_cast(step_id) != seq_len_ - 1) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1, + false /*infer_shape_mode*/); + } + net->GetMutable()->Run(*step_scopes[step_id], dev_ctx); + } + LinkBootMemoryGradients(step_scopes[0], false); + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_, + false /*infer_shape_mode*/); +} + +void RecurrentGradientAlgorithm::LinkBootMemoryGradients( + Scope* 
step_scope, bool infer_shape_mode) const { + for (auto& attr : arg_->memories) { + PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr, + "memory variable [%s] does not exists", attr.var); + PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr, + "boot variable [%s] does not exists", attr.boot_var); + Tensor* mem_grad = step_scope->NewVar(attr.var)->GetMutable(); + Tensor* boot_mem_grad = + step_scope->NewVar(attr.boot_var)->GetMutable(); + if (infer_shape_mode) { + boot_mem_grad->Resize(mem_grad->dims()); + } else { + boot_mem_grad->ShareDataWith(*mem_grad); + } + } +} + +void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const { + seq_len_ = scope.FindVar((arg_->inlinks[0]).external) + ->GetMutable() + ->dims()[0]; + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_, + true /*infer_shape_mode*/); + Variable* net = scope.FindVar(arg_->step_net); + PADDLE_ENFORCE(net != nullptr, "failed to get step net"); + for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { + if (static_cast(step_id) != seq_len_ - 1) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1, + true /*infer_shape_mode*/); + } + net->GetMutable()->InferShape(*step_scopes[step_id]); + } + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_, + true /*infer_shape_mode*/); + LinkBootMemoryGradients(step_scopes[0], true /*infer_shape_mode*/); +} + +void RecurrentGradientOp::Init() { + OperatorBase::Init(); + std::unique_ptr arg(new rnn::Argument()); + rnn::InitArgument(kArgName, arg.get(), *this); + alg_.Init(std::move(arg)); +} + +} // namespace operators +} // namespace paddle + +REGISTER_OP(recurrent_op, paddle::operators::RecurrentOp, + paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker); diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d1e60fed9cef3c6dccba3ad498fc3658a177b3f7 --- /dev/null +++ 
b/paddle/operators/recurrent_op.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/framework/operator.h" +#include "paddle/operators/rnn/recurrent_op_utils.h" + +namespace paddle { +namespace operators { + +// The sequence format in RecurrentOp is Tensor now. +// TODO(Yan Chunwei): +// 1. No-padding computing for sequences with indifinite length in one batch. +// 2. Hierarchical RNN for sequence with sub-sequence. +// 3. Internal Memory. +// 4. More Complex RNN architecture, such as Gated Feedback RNN. +// Refer to: https://arxiv.org/pdf/1502.02367.pdf + +class RecurrentAlgorithm { + public: + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const; + + void Init(std::unique_ptr arg) { arg_ = std::move(arg); } + + /** + * InferShape must be called before Run. + */ + void InferShape(const framework::Scope& scope) const; + + protected: + /* + * The step scopes will be stored in the father scope as a variable. + * + * NOTE the scopes are reused in both the forward and backward, so just + * create once and expand its size if more steps need. 
+ */ + void CreateScopes(const framework::Scope& scope) const; + + const std::vector& GetStepScopes( + const framework::Scope& scope) const { + return *scope.FindVar(arg_->step_scopes) + ->GetMutable>(); + } + + void InitMemories(framework::Scope* step_scopes, bool infer_shape_mode) const; + + private: + std::unique_ptr arg_; + mutable size_t seq_len_; +}; + +class RecurrentGradientAlgorithm { + /** + * RNN's backward alogorithm. + * + * To accelerate the development of RecurrentGradientOp, we decouple RNN's + * algorithm and `OperatorBase`'s implementation, the former contains the core + * implementation of a RNN, and will keep stable even if the framework changes + * a + * lot, and the latter is a wrapper acts like an dapter for it to make RNN an + * operator. + */ + public: + void Init(std::unique_ptr arg) { arg_ = std::move(arg); } + + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const; + + void LinkBootMemoryGradients(framework::Scope* step_scopes, + bool infer_shape_mode) const; + + /** + * InferShape must be called before Run. + */ + void InferShape(const framework::Scope& scope) const; + + protected: + inline const std::vector& GetStepScopes( + const framework::Scope& scope) const { + return *scope.FindVar(arg_->step_scopes) + ->GetMutable>(); + } + + private: + std::unique_ptr arg_; + mutable size_t seq_len_; +}; + +class RecurrentOp final : public framework::OperatorBase { + public: + void Init() override; + + /** + * InferShape must be called before Run. 
+ */ + void InferShape(const framework::Scope& scope) const override { + alg_.InferShape(scope); + } + + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const override { + alg_.Run(scope, dev_ctx); + } + + static const rnn::ArgumentName kArgName; + + private: + RecurrentAlgorithm alg_; +}; + +class RecurrentGradientOp final : public framework::OperatorBase { + public: + void Init() override; + + /** + * InferShape must be called before Run. + */ + void InferShape(const framework::Scope& scope) const override { + alg_.InferShape(scope); + } + + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const override { + alg_.Run(scope, dev_ctx); + } + + static const rnn::ArgumentName kArgName; + + private: + RecurrentGradientAlgorithm alg_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/recurrent_network_op_test.cc b/paddle/operators/recurrent_op_test.cc similarity index 88% rename from paddle/operators/recurrent_network_op_test.cc rename to paddle/operators/recurrent_op_test.cc index b0e61fbee611744adb85b498b1c3540f059afc8c..3607d14bf875dc2892fbbdc4dbc9ccf87c1b9784 100644 --- a/paddle/operators/recurrent_network_op_test.cc +++ b/paddle/operators/recurrent_op_test.cc @@ -11,20 +11,25 @@ limitations under the License. 
*/ +#include "paddle/operators/recurrent_op.h" + #include #include -#include "paddle/framework/net.h" +#include "paddle/framework/ddim.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" #include "paddle/framework/tensor.h" -#include "paddle/operators/recurrent_network_op.h" +#include "paddle/operators/net_op.h" namespace paddle { namespace operators { +using framework::make_ddim; +using framework::DDim; + class RecurrentOpTest : public ::testing::Test { -protected: + protected: virtual void SetUp() override { CreateGlobalVariables(); CreateStepNet(); @@ -55,7 +60,7 @@ protected: w->GetMutable()->mutable_data( make_ddim(std::vector{30, 30}), platform::CPUPlace()); - for (auto boot : std::vector{"x_boot", "h_boot"}) { + for (auto boot : std::vector{"h_boot"}) { LOG(INFO) << "create global variable " << boot; Variable* h_boot = scope_.NewVar(boot); h_boot->GetMutable()->mutable_data( @@ -71,7 +76,7 @@ protected: } void CreateRNNOp() { - OpDesc op_desc; + framework::OpDesc op_desc; op_desc.set_type("recurrent_op"); // inlinks 0 @@ -79,7 +84,6 @@ protected: op_desc.add_inputs("x0"); op_desc.add_inputs("x1"); // boot_memories 3 - op_desc.add_inputs("x_boot"); op_desc.add_inputs("h_boot"); // step net 5 op_desc.add_inputs("step_net"); @@ -91,7 +95,7 @@ protected: auto _input_format = std::vector{ 0, // in_link 3, // memories - 5 // step_net + 4 // step_net }; auto input_format = op_desc.add_attrs(); input_format->set_name("input_format"); @@ -129,12 +133,11 @@ protected: inlink_alias->add_strings(item); } // pre memories - for (const auto& item : - std::vector{"rnn/x@pre", "rnn/h@pre"}) { + for (const auto& item : std::vector{"rnn/h@pre"}) { pre_memories->add_strings(item); } // memories - for (const auto& item : std::vector{"rnn/x", "rnn/h"}) { + for (const auto& item : std::vector{"rnn/h"}) { memories->add_strings(item); } // output alias @@ -151,14 +154,11 @@ protected: LOG(INFO) << "create variable step_net"; Variable* var = 
scope_.NewVar("step_net"); auto net = var->GetMutable(); - // rnn/s is net's input or output? - net->inputs_ = {"rnn/h@pre", "rnn/w", "rnn/x"}; - net->inputs_ = {"rnn/s", "rnn/h"}; net->AddOp( OpRegistry::CreateOp("mul", {"rnn/h@pre", "rnn/w"}, {"rnn/s"}, {})); net->AddOp( - OpRegistry::CreateOp("add_two", {"rnn/x", "rnn/s"}, {"rnn/h"}, {})); + OpRegistry::CreateOp("add_two", {"x@alias", "rnn/s"}, {"rnn/h"}, {})); net->CompleteAddOp(); } @@ -174,7 +174,7 @@ TEST_F(RecurrentOpTest, Run) { } class RecurrentGradientAlgorithmTest : public ::testing::Test { -protected: + protected: virtual void SetUp() override { CreateGlobalVariables(); CreateStepScopes(); @@ -277,13 +277,11 @@ protected: LOG(INFO) << "create variable step_net"; Variable* var = scope_.NewVar("step_net"); auto net = var->GetMutable(); - net->AddOp(OpRegistry::CreateOp("mul", - {"rnn/h_pre", "rnn/w", "rnn/s_grad"}, - {"rnn/h_pre_grad", "rnn/w_grad"}, - {})); + net->AddOp(OpRegistry::CreateOp("mul", {"rnn/h_pre", "rnn/w", "rnn/s_grad"}, + {"rnn/h_pre_grad", "rnn/w_grad"}, {})); - net->AddOp(OpRegistry::CreateOp( - "add_two", {"rnn/h_grad"}, {"rnn/x_grad", "rnn/s_grad"}, {})); + net->AddOp(OpRegistry::CreateOp("add_two", {"rnn/h_grad"}, + {"rnn/x_grad", "rnn/s_grad"}, {})); net->CompleteAddOp(); } @@ -297,7 +295,8 @@ protected: inlink.internal = "rnn/x"; auto step_scopes = scope_.FindVar("step_scopes")->GetMutable>(); - rnn::SegmentInputs(*step_scopes, std::vector{inlink}, 10); + rnn::SegmentInputs(*step_scopes, std::vector{inlink}, 10, + true /*infer_shape_mode*/); } void LinkeMemories() { @@ -311,7 +310,8 @@ protected: auto step_scopes = scope_.FindVar("step_scopes")->GetMutable>(); for (int i = 1; i < 10; ++i) { - rnn::LinkMemories(*step_scopes, memories, i, -1); + rnn::LinkMemories(*step_scopes, memories, i, -1, + true /*infer_shape_mode*/); } } @@ -333,14 +333,14 @@ TEST(RecurrentOp, LinkMemories) { using namespace paddle::operators; // create and init step scopes - int len = 10; + size_t len = 10; 
std::vector step_scopes; - for (int i = 0; i < len; ++i) { + for (size_t i = 0; i < len; ++i) { auto scope = new Scope(); scope->NewVar("pre_h"); auto tensor = scope->NewVar("h")->GetMutable(); float* data = tensor->mutable_data({15, 20}, CPUPlace()); - for (int j = 0; j < 15 * 20; ++j) { + for (size_t j = 0; j < 15 * 20; ++j) { data[j] = rand() * (1. / (double)RAND_MAX); } step_scopes.push_back(scope); @@ -354,24 +354,24 @@ TEST(RecurrentOp, LinkMemories) { std::vector memories; memories.push_back(mem_attr); - for (int i = 1; i < len; ++i) { - rnn::LinkMemories(step_scopes, memories, i, -1); + for (size_t i = 1; i < len; ++i) { + rnn::LinkMemories(step_scopes, memories, i, -1, false /*infer_shape_mode*/); } // check - for (int i = 0; i < len - 1; ++i) { + for (size_t i = 0; i < len - 1; ++i) { const float* a = step_scopes[i]->FindVar("h")->GetMutable()->data(); const float* b = step_scopes[i + 1] ->FindVar("pre_h") ->GetMutable() ->data(); - for (size_t i = 0; i < 15 * 20; ++i) { - ASSERT_FLOAT_EQ(a[i], b[i]); + for (size_t j = 0; j < 15 * 20; ++j) { + ASSERT_FLOAT_EQ(a[j], b[j]); } } for (int i = len - 2; i >= 0; --i) { - rnn::LinkMemories(step_scopes, memories, i, 1); + rnn::LinkMemories(step_scopes, memories, i, 1, false /*infer_shape_mode*/); } // check for (int i = len - 2; i >= 0; --i) { @@ -379,8 +379,8 @@ TEST(RecurrentOp, LinkMemories) { step_scopes[i]->FindVar("pre_h")->GetMutable()->data(); const float* b = step_scopes[i + 1]->FindVar("h")->GetMutable()->data(); - for (size_t i = 0; i < 15 * 20; ++i) { - ASSERT_FLOAT_EQ(a[i], b[i]); + for (size_t j = 0; j < 15 * 20; ++j) { + ASSERT_FLOAT_EQ(a[j], b[j]); } } @@ -391,9 +391,4 @@ TEST(RecurrentOp, LinkMemories) { USE_OP(add_two); USE_OP(mul); - -// int main() { -// //! TODO(yuyang18): Temporary disable this unit-test because implementation -// //! error. 
-// return 0; -//} \ No newline at end of file +USE_OP_WITHOUT_KERNEL(recurrent_op); diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..32c6c2dd4efa85359b4e95471e8ba09e56afec57 --- /dev/null +++ b/paddle/operators/rnn/recurrent_op_utils.cc @@ -0,0 +1,160 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/rnn/recurrent_op_utils.h" + +namespace paddle { +namespace operators { +namespace rnn { + +namespace fmw = paddle::framework; + +void SegmentInputs(const std::vector& step_scopes, + const std::vector& inlinks, const size_t seq_len, + bool infer_shape_mode) { + PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided."); + for (size_t i = 0; i < inlinks.size(); ++i) { + auto input_var = step_scopes[0]->FindVar(inlinks[i].external); + PADDLE_ENFORCE(input_var != nullptr, "input link [%s] is not in scope.", + inlinks[i].external); + + Tensor* input = input_var->GetMutable(); + fmw::DDim dims = input->dims(); + PADDLE_ENFORCE(static_cast(dims[0]) == seq_len, + "all the inlinks must have same length"); + fmw::DDim step_dims = slice_ddim(dims, 1, dims.size()); + for (size_t j = 0; j < seq_len; j++) { + Tensor* step_input = + step_scopes[j]->NewVar(inlinks[i].internal)->GetMutable(); + if (!infer_shape_mode) { + *step_input = input->Slice(j, j + 1); + } + 
step_input->Resize(step_dims); + } + } +} + +void ConcatOutputs(const std::vector& step_scopes, + const std::vector& outlinks, const size_t seq_len, + bool infer_shape_mode) { + for (size_t i = 0; i < outlinks.size(); i++) { + auto output_var = step_scopes[0]->FindVar(outlinks[i].external); + PADDLE_ENFORCE(output_var != nullptr, "output link [%s] is not in scope.", + outlinks[i].external); + Tensor* output = output_var->GetMutable(); + + if (infer_shape_mode) { + auto step_scope_var = step_scopes[0]->FindVar(outlinks[i].internal); + PADDLE_ENFORCE(step_scope_var != nullptr, "%s not in scope", + outlinks[i].internal); + fmw::DDim step_dims = + step_scope_var->template GetMutable()->dims(); + std::vector dims_vec = vectorize(step_dims); + dims_vec.insert(dims_vec.begin(), seq_len); + output->Resize(fmw::make_ddim(dims_vec)); + } else { + output->mutable_data(platform::CPUPlace()); + for (size_t j = 0; j < seq_len; j++) { + Tensor* step_output = + step_scopes[j]->FindVar(outlinks[i].internal)->GetMutable(); + // TODO(luotao02) data type and platform::DeviceContext() should set + // correctly + (output->Slice(j, j + 1)) + .CopyFrom(*step_output, platform::CPUPlace()); + } + } + } +} + +void LinkMemories(const std::vector& scopes, + const std::vector& memories, + const size_t step_id, const int offset, + bool infer_shape_mode) { + PADDLE_ENFORCE_LT(step_id, scopes.size(), + "step [%d] is out of range of step scopes' size [%d]", + step_id, scopes.size()); + PADDLE_ENFORCE_GE(static_cast(step_id) + offset, 0, + "offset [%d] must be large than -[%d]", offset, step_id); + PADDLE_ENFORCE_LT( + step_id + offset, scopes.size(), + "offset [%d] is out of range, it must be less than (%d - %d)", offset, + scopes.size(), step_id); + auto scope = scopes[step_id]; + auto linked_scope = scopes[step_id + offset]; + for (auto& attr : memories) { + auto mem = scope->FindVar(attr.pre_var)->GetMutable(); + auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable(); + if 
(infer_shape_mode) { + mem->Resize(linked_mem->dims()); + } else { + mem->ShareDataWith(*linked_mem); + } + } +} + +void InitArgument(const ArgumentName& name, Argument* arg, + const OperatorBase& op) { + arg->step_net = op.Input(name.step_net); + arg->step_scopes = op.Output(name.step_scopes); + + auto inlinks = op.Inputs(name.inlinks); + auto inlink_alias = op.GetAttr>(name.inlink_alias); + PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(), + "the size of inlinks and inlink_alias don't match:%d,%d", + inlinks.size(), inlink_alias.size()); + for (size_t i = 0; i < inlinks.size(); ++i) { + rnn::Link link; + link.external = inlinks[i]; + link.internal = inlink_alias[i]; + (arg->inlinks).push_back(link); + } + + auto outlinks = op.Outputs(name.outlinks); + auto outlink_alias = op.GetAttr>(name.outlink_alias); + PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(), + "the size of outlinks and outlink_alias don't match:%d,%d", + outlinks.size(), outlink_alias.size()); + for (size_t i = 0; i < outlinks.size(); ++i) { + rnn::Link link; + link.external = outlinks[i]; + link.internal = outlink_alias[i]; + (arg->outlinks).push_back(link); + } + + auto boot_memories = op.Inputs(name.boot_memories); + + // attributes + auto memories = op.GetAttr>(name.memories); + auto pre_memories = op.GetAttr>(name.pre_memories); + + PADDLE_ENFORCE(memories.size() == boot_memories.size(), + "the size of memories, boot_memories don't match:%d,%d", + memories.size(), boot_memories.size()); + PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(), + "the size of pre_memories, boot_memories don't match:%d,%d", + pre_memories.size(), boot_memories.size()); + PADDLE_ENFORCE(memories.size() > 0, "more than 1 memories should be set"); + + for (size_t i = 0; i < memories.size(); ++i) { + rnn::MemoryAttr mem_attr; + mem_attr.var = memories[i]; + mem_attr.pre_var = pre_memories[i]; + mem_attr.boot_var = boot_memories[i]; + (arg->memories).push_back(mem_attr); + } +} + +} // namespace rnn +} 
// namespace operators +} // namespace paddle diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..379754b98fcead6debe0a60efa62fce4b7761940 --- /dev/null +++ b/paddle/operators/rnn/recurrent_op_utils.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include + +#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" + +namespace paddle { +namespace operators { +namespace rnn { + +/** + * Memory of a RNN (same as the role of `Momory` in PaddlePaddle). + * + * Memory attributes cached by this op, dims will be infered from + * boot memories in father scope. Other attributes are copied from Op's proto + * attributes. + */ +struct MemoryAttr { + // name of current state variable + std::string var; + // name of previous step's state variable + std::string pre_var; + // name of the variables to init this memory (same role of `boot_layer` in + // PaddlePaddle), which is store in father's scope. + std::string boot_var; +}; + +struct Link { + // input or output links name. + std::string internal; + // alias to avoid duplicate keys in scopes. 
+ std::string external; +}; + +struct Argument { + std::string step_net; + std::string step_scopes; + std::vector inlinks; + std::vector outlinks; + std::vector memories; +}; + +struct ArgumentName { + std::string step_net; + std::string step_scopes; + std::string inlinks; + std::string outlinks; + std::string inlink_alias; // the alias of inlinks in step net. + std::string outlink_alias; // the alias of outlinks in step net. + std::string memories; // the memory name + std::string pre_memories; // the previous memory name + std::string boot_memories; // the boot memory name +}; + +/** + * Prepare inputs for each step net. + */ +void SegmentInputs(const std::vector& step_scopes, + const std::vector& inlinks, const size_t seq_len, + bool infer_shape_mode); + +/** + * Process outputs of step nets and merge to variables. + */ +void ConcatOutputs(const std::vector& step_scopes, + const std::vector& outlinks, const size_t seq_len, + bool infer_shape_mode); + +void LinkMemories(const std::vector& step_scopes, + const std::vector& memories, const size_t step_id, + const int offset, bool infer_shape_mode); + +void InitArgument(const ArgumentName& name, Argument* arg, + const OperatorBase& op); + +} // namespace rnn +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc index 178ea3c6145e00979b4eed1de99e81d1dd587fb4..01cb6b1fb5e64a6865c78fb30435d8e973cf387d 100644 --- a/paddle/operators/rowwise_add_op.cc +++ b/paddle/operators/rowwise_add_op.cc @@ -17,7 +17,7 @@ namespace paddle { namespace operators { class RowwiseAddOp : public OperatorWithKernel { -protected: + protected: void InferShape(const InferShapeContext &ctx) const override { PADDLE_ENFORCE(ctx.InputSize() == 2UL, "Two inputs is needed by rowwise add"); @@ -33,7 +33,7 @@ protected: }; class RowwiseAddOpMaker : public OpProtoAndCheckerMaker { -public: + public: RowwiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker) : 
OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The left input of row-wise add op, must be matrix"); @@ -47,7 +47,7 @@ for i in xrange(X.shape[0]): } }; class RowwiseAddGradOp : public OperatorWithKernel { -protected: + protected: void InferShape(const InferShapeContext &ctx) const override { PADDLE_ENFORCE(ctx.InputSize() == 4UL, "RowwiseAddGrad inputs is I, O, OG, size must be 4"); diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu index f48dfeb6f2c516d8c1096885ad60dc333def6b1f..b277e0644ae6e1e9dbeb30ba45683d4b5331b558 100644 --- a/paddle/operators/rowwise_add_op.cu +++ b/paddle/operators/rowwise_add_op.cu @@ -1,3 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#define EIGEN_USE_GPU #include "paddle/operators/rowwise_add_op.h" REGISTER_OP_GPU_KERNEL(rowwise_add, diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h index 321f51e61d472ede6cfc923fcf2a3d45324abd23..06af88a993d19bb03ae468b468cbfef3b782d5f0 100644 --- a/paddle/operators/rowwise_add_op.h +++ b/paddle/operators/rowwise_add_op.h @@ -20,7 +20,7 @@ namespace operators { template class RowwiseAddKernel : public OpKernel { -public: + public: void Compute(const ExecutionContext& context) const override { auto out = context.Output(0); out->mutable_data(context.GetPlace()); @@ -33,14 +33,14 @@ public: const int rest_size = input.size() / bias_size; Eigen::DSizes one_d(input.size()); Eigen::DSizes bcast(rest_size); - output.reshape(one_d).device(*(context.GetEigenDevice())) = + output.reshape(one_d).device(context.GetEigenDevice()) = input.reshape(one_d) + bias.broadcast(bcast).reshape(one_d); } }; template class RowwiseAddGradKernel : public OpKernel { -public: + public: void Compute(const ExecutionContext& context) const override { auto XGrad = context.Output(0); auto bGrad = context.Output(1); diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 9a84dc8af3b3e649b776ca8a97dedba1fa3ff48d..e0532f2f090aecead499ccef8afb117876be5c78 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -18,13 +18,13 @@ namespace paddle { namespace operators { class SGDOp : public OperatorWithKernel { -protected: + protected: void InferShape(const InferShapeContext &ctx) const override { - PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of SGDOp must be two"); - PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of SGDOp must be one"); - PADDLE_ENFORCE(ctx.InputVar(0) != nullptr, "inputs[0] mast be set"); - PADDLE_ENFORCE(ctx.InputVar(1) != nullptr, "inputs[1] mast be set"); - PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr, "outputs[0] mast be set"); + PADDLE_ENFORCE_EQ(ctx.InputSize(), 2, "Input size of SGDOp must 
be two"); + PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1, "Output size of SGDOp must be one"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(0), "inputs[0] mast be set"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(1), "inputs[1] mast be set"); + PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar(0), "outputs[0] mast be set"); PADDLE_ENFORCE(ctx.Input(0)->dims() == ctx.Input(1)->dims(), "Two input of SGD Op's dimension must be same."); ctx.Output(0)->Resize(ctx.Input(0)->dims()); @@ -32,7 +32,7 @@ protected: }; class SGDOpMaker : public OpProtoAndCheckerMaker { -public: + public: SGDOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("param", "input parameter"); diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index f8f5b90cab460b4457cfb0a88bfc012bafe0fbc2..72629ccfbb8bc8ec53045289bd985c721c62fa10 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -1,3 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#define EIGEN_USE_GPU #include "paddle/operators/sgd_op.h" -REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h index af1dfdd756ceb9991bee6b85c3281c05f0fb5a9f..bf5b195933fce7faa46bcc96032e784076178cf7 100644 --- a/paddle/operators/sgd_op.h +++ b/paddle/operators/sgd_op.h @@ -20,7 +20,7 @@ namespace operators { template class SGDOpKernel : public OpKernel { -public: + public: void Compute(const ExecutionContext& ctx) const override { auto param = ctx.Input("param"); auto grad = ctx.Input("grad"); @@ -29,8 +29,12 @@ public: param_out->mutable_data(ctx.GetPlace()); - EigenVector::Flatten(*param_out).device(*(ctx.GetEigenDevice())) = - EigenVector::Flatten(*param) - lr * EigenVector::Flatten(*grad); + auto p = EigenVector::Flatten(*param); + auto g = EigenVector::Flatten(*grad); + auto o = EigenVector::Flatten(*param_out); + auto place = ctx.GetEigenDevice(); + + o.device(place) = p - lr * g; } }; diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index a81ab262cc6fe7bdff0045259e0030f3d46f503f..1eb795faa858796f7a34aa495b43d043fdb5dd43 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -17,7 +17,7 @@ namespace paddle { namespace operators { class SigmoidOp : public OperatorWithKernel { -protected: + protected: void InferShape(const InferShapeContext &ctx) const override { PADDLE_ENFORCE(ctx.InputSize() == 1, "Sigmoid Op only have one input"); PADDLE_ENFORCE(ctx.OutputSize() == 1, "Sigmoid Op only have one output"); @@ -26,7 +26,7 @@ protected: }; class SigmoidOpMaker : public OpProtoAndCheckerMaker { -public: + public: SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "sigmoid input"); @@ -36,11 +36,9 @@ public: }; class SigmoidOpGrad : public OperatorWithKernel { -protected: - void InferShape(const 
InferShapeContext &ctx) const override {} - std::string DebugString() const override { - LOG(INFO) << "SigmoidGrad"; - return ""; + protected: + void InferShape(const InferShapeContext &ctx) const override { + ctx.Output(0)->Resize(ctx.Input(0)->dims()); } }; @@ -51,3 +49,5 @@ REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker); REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, ops::SigmoidOpGrad); REGISTER_OP_CPU_KERNEL(sigmoid, ops::SigmoidKernel); +REGISTER_OP_CPU_KERNEL(sigmoid_grad, + ops::SigmoidGradKernel); diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu index f679b20418f04eff4310efe4e121963ce5a235e0..e80ba081f2ff805664cf92f3cb47e9ad51889058 100644 --- a/paddle/operators/sigmoid_op.cu +++ b/paddle/operators/sigmoid_op.cu @@ -1,3 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#define EIGEN_USE_GPU #include "paddle/operators/sigmoid_op.h" REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel); +REGISTER_OP_GPU_KERNEL(sigmoid_grad, + ops::SigmoidGradKernel); diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h index 3dd23a9ebc7ac0972d6ee07b9ac051d59e66f62f..d513261e74423ce93a50eaaaec1c7d5fadb8f4a8 100644 --- a/paddle/operators/sigmoid_op.h +++ b/paddle/operators/sigmoid_op.h @@ -21,16 +21,37 @@ namespace operators { template class SigmoidKernel : public OpKernel { -public: + public: void Compute(const ExecutionContext& context) const override { auto input = context.Input(0); auto output = context.Output(0); output->mutable_data(context.GetPlace()); - EigenVector::Flatten(*output).device( - *(context.GetEigenDevice())) = - 1.0 / (1.0 + (-1.0 * EigenVector::Flatten(*input)).exp()); + // The clipping is used in Paddle's raw implenmention + auto X = EigenVector::Flatten(*input); + auto Y = EigenVector::Flatten(*output); + auto place = context.GetEigenDevice(); + + Y.device(place) = 1.0 / (1.0 + (-1.0 * X).exp()); } }; + +template +class SigmoidGradKernel : public OpKernel { + public: + void Compute(const ExecutionContext& context) const override { + auto Y_t = context.Input("Y"); + auto dY_t = context.Input(framework::GradVarName("Y")); + auto dX_t = context.Output(framework::GradVarName("X")); + + dX_t->mutable_data(context.GetPlace()); + + auto dX = EigenVector::Flatten(*dX_t); + auto Y = EigenVector::Flatten(*Y_t); + auto dY = EigenVector::Flatten(*dY_t); + dX.device(context.GetEigenDevice()) = dY * Y * (1. - Y); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 5b59fad7d5f9729b0862f8cd78cb32f94f87f513..c08e1b153c05baa474bcd344c1e87405193cb688 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -1,35 +1,37 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ #include "paddle/operators/softmax_op.h" namespace paddle { namespace operators { class SoftmaxOp : public OperatorWithKernel { -protected: + protected: void InferShape(const InferShapeContext &ctx) const override { - PADDLE_ENFORCE(ctx.InputSize() == 1, "Only one input is need for softmax"); - PADDLE_ENFORCE(ctx.Input(0)->dims().size() == 2, - "The input of softmax op must be matrix"); - PADDLE_ENFORCE(ctx.OutputSize() == 1, - "Only one output is need for softmax"); - ctx.Output(0)->Resize(ctx.Input(0)->dims()); + PADDLE_ENFORCE_EQ(ctx.InputSize(), 1UL, + "Only one input is need for softmax"); + PADDLE_ENFORCE_EQ(ctx.Input("X")->dims().size(), 2UL, + "The input of softmax op must be matrix"); + PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1UL, + "Only one output is need for softmax"); + ctx.Output("Y")->Resize(ctx.Input("X")->dims()); } }; class SoftmaxOpMaker : public OpProtoAndCheckerMaker { -public: + public: SoftmaxOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "input of softmax"); @@ -39,11 +41,20 @@ public: }; class SoftmaxOpGrad : public OperatorWithKernel { -protected: - void InferShape(const InferShapeContext &ctx) const override {} - std::string DebugString() const override { - LOG(INFO) << "SoftmaxOpGrad"; - return ""; + protected: + void InferShape(const InferShapeContext &ctx) const override { + PADDLE_ENFORCE_EQ(ctx.InputSize(), 3UL, + "Input of SoftmaxOpGrad should be 3, X, Y, YG"); + PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1UL, + "Output of SoftmaxOpGrad should be 1"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")), + "Input(Y@GRAD) should not be null"); + PADDLE_ENFORCE(ctx.Input("Y")->dims() == + ctx.Input(framework::GradVarName("Y"))->dims(), + "the shape of Input(0) and Input(1) should be the same"); + ctx.Output(framework::GradVarName("X")) + ->Resize(ctx.Input("Y")->dims()); } }; @@ -51,5 
+62,7 @@ protected: } // namespace paddle REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker); -REGISTER_GRADIENT_OP(softmax, softmax_grad, ops::SoftmaxOpGrad); REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel); +REGISTER_GRADIENT_OP(softmax, softmax_grad, ops::SoftmaxOpGrad); +REGISTER_OP_CPU_KERNEL(softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu index a1f6944a369fe5148ffcfeabf3bf7063dcbc2664..b79228580a7ea0f70b62eb2dc7a61cf85bc0b5fb 100644 --- a/paddle/operators/softmax_op.cu +++ b/paddle/operators/softmax_op.cu @@ -1,4 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU #include "paddle/framework/op_registry.h" #include "paddle/operators/softmax_op.h" REGISTER_OP_GPU_KERNEL(softmax, ops::SoftmaxKernel); +REGISTER_OP_GPU_KERNEL(softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index a5c19c5fc7c6f5909dbb355aff09bf15405b6957..b2dbcf57edf1a64da8da0d9a4c14d708eec17f3f 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -1,19 +1,22 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once +#include "paddle/framework/ddim.h" +#include "paddle/framework/operator.h" +#include "paddle/framework/tensor.h" #include "paddle/operators/type_alias.h" namespace paddle { @@ -21,10 +24,10 @@ namespace operators { template class SoftmaxKernel : public OpKernel { -public: + public: void Compute(const ExecutionContext& context) const override { - auto input = context.Input(0); - auto output = context.Output(0); + auto input = context.Input("X"); + auto output = context.Output("Y"); output->mutable_data(context.GetPlace()); auto logits = EigenMatrix::From(*input); @@ -46,9 +49,9 @@ public: .reshape(batch_by_one) .broadcast(one_by_class)); - softmax.device(*(context.GetEigenDevice())) = shifted_logits.exp(); + softmax.device(context.GetEigenDevice()) = shifted_logits.exp(); - softmax.device(*(context.GetEigenDevice())) = + softmax.device(context.GetEigenDevice()) = (softmax * softmax.sum(along_class) .inverse() @@ -57,5 +60,38 @@ public: .broadcast(one_by_class)); } }; + +template +class SoftmaxGradKernel : public OpKernel { + public: 
+ void Compute(const ExecutionContext& context) const override { + std::shared_ptr scale_ = std::make_shared(); + + auto Y = context.Input("Y"); + auto dY = context.Input(framework::GradVarName("Y")); + auto dX = context.Output(framework::GradVarName("X")); + dX->mutable_data(context.GetPlace()); + + const int batch_size = Y->dims()[0]; + const int class_num = Y->dims()[1]; + + Eigen::DSizes along_class(1); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, class_num); + + auto Y_eigen = EigenMatrix::From(*Y); + auto dY_eigen = EigenMatrix::From(*dY); + auto dX_eigen = EigenMatrix::From(*dX); + auto place = context.GetEigenDevice(); + + auto dot = (Y_eigen * dY_eigen) + .sum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class); + dX_eigen.device(place) = (dY_eigen - dot) * Y_eigen; + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/operators/type_alias.h b/paddle/operators/type_alias.h index 93b62cddc819e0d1fd48323e474a294ff0d327e1..eac12d35dd8d2977191218167ebb0a6e638d5d73 100644 --- a/paddle/operators/type_alias.h +++ b/paddle/operators/type_alias.h @@ -15,42 +15,40 @@ #pragma once #include "paddle/framework/eigen.h" -#include "paddle/framework/net.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/net_op.h" namespace paddle { namespace operators { using OpKernel = framework::OpKernel; +using OperatorBase = framework::OperatorBase; using InferShapeContext = framework::InferShapeContext; using ExecutionContext = framework::ExecutionContext; using Variable = framework::Variable; -template using EigenScalar = framework::EigenScalar; -template using EigenVector = framework::EigenVector; -template using EigenMatrix = framework::EigenMatrix; -template using EigenTensor = framework::EigenTensor; using Tensor = framework::Tensor; +using Scope = framework::Scope; using OperatorWithKernel = framework::OperatorWithKernel; +using OperatorBase = framework::OperatorBase; using 
OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker; using OpProto = framework::OpProto; using OpAttrChecker = framework::OpAttrChecker; using CPUPlace = platform::CPUPlace; using GPUPlace = platform::GPUPlace; -using NetOp = framework::NetOp; using OpRegistry = framework::OpRegistry; + } // namespace operators } // namespace paddle diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..405b84b76d2e24db25d2ff16e99495f2f132ef09 --- /dev/null +++ b/paddle/operators/uniform_random_op.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" + +namespace paddle { +namespace operators { + +// It seems that Eigen::Tensor::random in GPU will SEGFAULT. +// Use std::random and thrust::random(thrust is a std library in CUDA) to +// implement uniform random. 
+template +class CPUUniformRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output(0); + T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = + static_cast(context.op_.GetAttr("seed")); + std::minstd_rand engine; + if (seed == 0) { + seed = std::random_device()(); + } + engine.seed(seed); + std::uniform_real_distribution dist( + static_cast(context.op_.GetAttr("min")), + static_cast(context.op_.GetAttr("max"))); + for (ssize_t i = 0; i < framework::product(tensor->dims()); ++i) { + data[i] = dist(engine); + } + } +}; + +class UniformRandomOp : public framework::OperatorWithKernel { + protected: + void InferShape(const framework::InferShapeContext& ctx) const override { + PADDLE_ENFORCE(GetAttr("min") < GetAttr("max"), + "uniform_random's min must less then max"); + auto* tensor = ctx.Output(0); + auto dims = GetAttr>("dims"); + tensor->Resize(framework::make_ddim(dims)); + } +}; + +class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker { + public: + UniformRandomOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "The output tensor of uniform random op"); + AddComment(R"DOC(Uniform random operator. + +Used to initialize tensor with uniform random generator. +)DOC"); + AddAttr>("dims", "the dimension of random tensor"); + AddAttr("min", "Minimum value of uniform random").SetDefault(-1.0f); + AddAttr("max", "Maximun value of uniform random").SetDefault(1.0f); + AddAttr("seed", + "Random seed of uniform random. 
" + "0 means generate a seed by system") + .SetDefault(0); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OP(uniform_random, paddle::operators::UniformRandomOp, + paddle::operators::UniformRandomOpMaker); +REGISTER_OP_CPU_KERNEL(uniform_random, + paddle::operators::CPUUniformRandomKernel); diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..f1a63e52ec0d3d46a505a89d7d7916bf93a58221 --- /dev/null +++ b/paddle/operators/uniform_random_op.cu @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" + +namespace paddle { +namespace operators { + +template +struct UniformGenerator { + T min_, max_; + unsigned int seed_; + + __host__ __device__ UniformGenerator(T min, T max, int seed) + : min_(min), max_(max), seed_(seed) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n); + return dist(rng); + } +}; + +// It seems that Eigen::Tensor::random in GPU will SEGFAULT. +// Use std::random and thrust::random(thrust is a std library in CUDA) to +// implement uniform random. 
+template +class GPUUniformRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output(0); + T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = + static_cast(context.op_.GetAttr("seed")); + if (seed == 0) { + seed = std::random_device()(); + } + T min = static_cast(context.op_.GetAttr("min")); + T max = static_cast(context.op_.GetAttr("max")); + thrust::counting_iterator index_sequence_begin(0); + ssize_t N = framework::product(tensor->dims()); + thrust::transform(index_sequence_begin, index_sequence_begin + N, + thrust::device_ptr(data), + UniformGenerator(min, max, seed)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_GPU_KERNEL(uniform_random, + paddle::operators::GPUUniformRandomKernel); diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index ef72b973c1a465a8ac03cae1070429160eac0ac1..0547ac93cd183afbcede41d280c6b4b16ed7dab1 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -666,4 +666,24 @@ void Argument::subArgFrom(const Argument& input, } } +void Argument::reorganizeSeqInfo( + const ICpuGpuVectorPtr seqStartPos, + const ICpuGpuVectorPtr subSeqStartPos, + std::vector>& reorganizedSeqInfo) { + int* seqStarts = seqStartPos->getMutableData(false); + int* subSeqStarts = subSeqStartPos->getMutableData(false); + + int seqNum = seqStartPos->getSize() - 1; + reorganizedSeqInfo.resize(seqNum, std::vector()); + int seqIdx = 0; + for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) { + reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]); + if (subSeqStarts[i] == seqStarts[seqIdx + 1]) { + seqIdx++; + if (seqIdx == seqNum) return; + reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]); + } + } +} + } // namespace paddle diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index 
0ccdef802e71b659788cfd24f28ebe43e1917db1..d8d7a4398f99a2794c5d25528a7d582f5ed629ba 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -317,6 +317,30 @@ struct Argument { */ void printValueString(std::ostream& stream, const std::string& prefix = "") const; + + /** + * @brief reorganizeSeqInfo will reorganize sequenceStartPositions and + * subSequenceStartPositions into a 2 dimensional array: reorganizedSeqInfo. + * + * @param seqStartPos: sequenceStartPositions of an Argument. + * @param subSeqStartPos: subSequenceStartPositions of an Argument. + * @param reorganizedSeqInfo: the reorganized sequence start position information. + * + * Examples: + * seqStartPos: [0, 4, 15, 20, 28] + * subSeqStartPos: [0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28] + * reorganizedSeqInfo: + * [ + * [0,3,4], + * [4,5,7,10,15], + * [15,20], + * [20,22,23,25,28] + * ] + */ + static void reorganizeSeqInfo( + const ICpuGpuVectorPtr seqStartPos, + const ICpuGpuVectorPtr subSeqStartPos, + std::vector>& reorganizedSeqInfo); }; } // namespace paddle diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 2038fafe2e15ec2631726643695ac6cbc317fed9..08b5b2cff900cc4239a615fe7d7f6b5faa13510b 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -40,7 +40,7 @@ class DeviceContext { class CPUDeviceContext : public DeviceContext { public: CPUDeviceContext(); - CPUDeviceContext(CPUPlace); + explicit CPUDeviceContext(CPUPlace); virtual ~CPUDeviceContext() {} Eigen::DefaultDevice* eigen_device() const; @@ -69,10 +69,10 @@ class CUDADeviceContext : public DeviceContext { // clang-format off /*! \brief Return cublas handle in the device context. */ - cublasHandle_t cublas_handle (); + cublasHandle_t cublas_handle(); /*! \brief Return cudnn handle in the device context. */ - cudnnHandle_t cudnn_handle (); + cudnnHandle_t cudnn_handle(); /*! \brief Return curand handle in the device context. 
*/ curandGenerator_t curand_generator(); diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index af2ce17fc2238dda62e9888ebe9426edcd55d2bc..65345c433c0a328e7f89038a39312edba35eb8c7 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -15,24 +15,28 @@ limitations under the License. */ #include "paddle/platform/device_context.h" #include "gtest/gtest.h" -using DEVICE_GPU = Eigen::GpuDevice; TEST(Device, Init) { + using paddle::platform::DeviceContext; + using paddle::platform::CUDADeviceContext; + using paddle::platform::GPUPlace; + int count = paddle::platform::GetDeviceCount(); for (int i = 0; i < count; i++) { - paddle::platform::DeviceContext* device_context = - new paddle::platform::CUDADeviceContext(i); + DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); Eigen::GpuDevice* gpu_device = - device_context->template get_eigen_device(); + device_context->template get_eigen_device(); ASSERT_NE(nullptr, gpu_device); delete device_context; } } TEST(Device, CUDADeviceContext) { + using paddle::platform::CUDADeviceContext; + using paddle::platform::GPUPlace; + int count = paddle::platform::GetDeviceCount(); for (int i = 0; i < count; i++) { - paddle::platform::CUDADeviceContext* device_context = - new paddle::platform::CUDADeviceContext(i); + CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); diff --git a/paddle/platform/dynload/cublas.cc b/paddle/platform/dynload/cublas.cc index 4e3dfdaefb2348346e8f917b1f6c758bf6d91a1a..9cd2a1f565526f8dc45932ba6168f4e25c6ad238 100644 --- a/paddle/platform/dynload/cublas.cc +++ b/paddle/platform/dynload/cublas.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include namespace paddle { diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc index 8b5e15b5efcdae6a1eed09f002eb2f4f2163035f..d3e4cb567d71b987724366b6a0896f5df0eb6055 100644 --- a/paddle/platform/dynload/cudnn.cc +++ b/paddle/platform/dynload/cudnn.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include namespace paddle { @@ -25,4 +39,4 @@ CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); } // namespace dynload } // namespace platform -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/platform/dynload/curand.cc b/paddle/platform/dynload/curand.cc index 5c1fab992c98569d4a95b6e699d97d428511e48e..d05dd88126bfee7278e553710a717b8f2eb02ae0 100644 --- a/paddle/platform/dynload/curand.cc +++ b/paddle/platform/dynload/curand.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include namespace paddle { @@ -10,6 +24,7 @@ void *curand_dso_handle; #define DEFINE_WRAP(__name) DynLoad__##__name __name CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); -} -} -} \ No newline at end of file + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index 26c8eb78e614a68ec9728aad727d8fe3e08547ae..d2adb997de8e36922d5056b20f238a82eee74f8c 100644 --- a/paddle/platform/enforce.h +++ b/paddle/platform/enforce.h @@ -144,12 +144,12 @@ inline void throw_on_error(T e) { throw_on_error(e, ""); } -#define PADDLE_THROW(...) \ - do { \ - throw ::paddle::platform::EnforceNotMet( \ - std::make_exception_ptr( \ - std::runtime_error(string::Sprintf(__VA_ARGS__))), \ - __FILE__, __LINE__); \ +#define PADDLE_THROW(...) \ + do { \ + throw ::paddle::platform::EnforceNotMet( \ + std::make_exception_ptr( \ + std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \ + __FILE__, __LINE__); \ } while (0) #define PADDLE_ENFORCE(...) \ @@ -162,5 +162,41 @@ inline void throw_on_error(T e) { } \ } while (0) +/* + * Some enforce helpers here, usage: + * int a = 1; + * int b = 2; + * PADDLE_ENFORCE_EQ(a, b); + * + * will raise an exception described as follows: + * "enforce a == b failed, 1 != 2" with detailed stack information. 
+ * + * extra messages are also supported, for example: + * PADDLE_ENFORCE_EQ(a, b, "some simple enforce failed between %d numbers", 2) + */ + +#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) +#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__) +#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__) +#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__) +#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) +#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) +#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ + PADDLE_ENFORCE(nullptr != (__VAL), #__VAL " should not be null\n%s", \ + paddle::string::Sprintf("" __VA_ARGS__)); + +#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ + PADDLE_ENFORCE(__VAL0 __CMP __VAL1, \ + "enforce %s " #__CMP " %s failed, %s " #__INV_CMP " %s\n%s", \ + #__VAL0, #__VAL1, std::to_string(__VAL0), \ + std::to_string(__VAL1), \ + paddle::string::Sprintf("" __VA_ARGS__)); + } // namespace platform } // namespace paddle diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc index 2ac31812a80d8dd57ce82234cb5835e029a46067..4dfb69754608cb1120baa295072c3d031a4e1a7b 100644 --- a/paddle/platform/enforce_test.cc +++ b/paddle/platform/enforce_test.cc @@ -9,8 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/platform/enforce.h" +#include + #include "gtest/gtest.h" +#include "paddle/platform/enforce.h" TEST(ENFORCE, OK) { PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345); @@ -34,3 +36,189 @@ TEST(ENFORCE, FAILED) { } ASSERT_TRUE(in_catch); } + +TEST(ENFORCE, NO_ARG_OK) { + int a = 2; + int b = 2; + PADDLE_ENFORCE_EQ(a, b); + // test enforce with extra message. + PADDLE_ENFORCE_EQ(a, b, "some thing wrong %s", "info"); +} + +TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) { + int a = 2; + bool in_catch = false; + + try { + PADDLE_ENFORCE_EQ(a, 1 + 3); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = "enforce a == 1 + 3 failed, 2 != 4"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} + +TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) { + int a = 2; + bool in_catch = false; + + try { + PADDLE_ENFORCE_EQ(a, 1 + 3, "%s size not match", "their"); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = + "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} + +TEST(ENFORCE_NE, OK) { + PADDLE_ENFORCE_NE(1, 2); + PADDLE_ENFORCE_NE(1.0, 2UL); +} +TEST(ENFORCE_NE, FAIL) { + bool in_catch = false; + + try { + // 2UL here to check data type compatible + PADDLE_ENFORCE_NE(1.0, 1UL); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = "enforce 1.0 != 1UL failed, 1.000000 == 1"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} + +TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); } +TEST(ENFORCE_GT, FAIL) { + bool in_catch = false; + + try { + // 2UL here to check data type compatible + 
PADDLE_ENFORCE_GT(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = "enforce 1 > 2UL failed, 1 <= 2"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} + +TEST(ENFORCE_GE, OK) { + PADDLE_ENFORCE_GE(2, 2UL); + PADDLE_ENFORCE_GE(3, 2UL); + PADDLE_ENFORCE_GE(3, 2); + PADDLE_ENFORCE_GE(3.21, 2UL); +} +TEST(ENFORCE_GE, FAIL) { + bool in_catch = false; + + try { + PADDLE_ENFORCE_GE(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = "enforce 1 >= 2UL failed, 1 < 2"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} + +TEST(ENFORCE_LE, OK) { + PADDLE_ENFORCE_LE(1, 1); + PADDLE_ENFORCE_LE(1, 1UL); + PADDLE_ENFORCE_LE(2, 3UL); + PADDLE_ENFORCE_LE(2UL, 3); + PADDLE_ENFORCE_LE(2UL, 3.2); +} +TEST(ENFORCE_LE, FAIL) { + bool in_catch = false; + + try { + PADDLE_ENFORCE_GT(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = "enforce 1 > 2UL failed, 1 <= 2"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} + +TEST(ENFORCE_LT, OK) { + PADDLE_ENFORCE_LT(3, 10); + PADDLE_ENFORCE_LT(2, 3UL); + PADDLE_ENFORCE_LT(2UL, 3); +} +TEST(ENFORCE_LT, FAIL) { + bool in_catch = false; + + try { + PADDLE_ENFORCE_LT(1UL, 0.12); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = "enforce 1UL < 0.12 failed, 1 >= 0.12"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} + +TEST(ENFORCE_NOT_NULL, OK) { + int* a = new int; + PADDLE_ENFORCE_NOT_NULL(a); + delete a; +} +TEST(ENFORCE_NOT_NULL, FAIL) { + 
bool in_catch = false; + int* a{nullptr}; + + try { + PADDLE_ENFORCE_NOT_NULL(a); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = "a should not be null"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} diff --git a/paddle/platform/place.h b/paddle/platform/place.h index 7cead183884bc9379355cd931921b40d6c11ce90..a82e8c942fa28297d91056a66b61f085f2bdb946 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -32,7 +32,7 @@ struct CPUPlace { struct GPUPlace { GPUPlace() : GPUPlace(0) {} - GPUPlace(int d) : device(d) {} + explicit GPUPlace(int d) : device(d) {} // needed for variant equality comparison inline bool operator==(const GPUPlace &o) const { return device == o.device; } diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index 845589dcb1997b662b5175e5cce320eec4be4a8d..8e6b258e00c0012876cda8ffc5b340322d51e894 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1,9 +1,10 @@ cc_library(paddle_pybind SHARED SRCS pybind.cc - DEPS pybind python + DEPS pybind python backward fc_op sgd_op add_op mean_op cross_entropy_op - recurrent_network_op) + recurrent_op + fill_zeros_like_op) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc deleted file mode 100644 index 801ef50e577d563f4534f33e49aa7b72ab840d89..0000000000000000000000000000000000000000 --- a/paddle/pybind/pybind.cc +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/framework/net.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/operator.h" -#include "paddle/framework/scope.h" -#include "paddle/pybind/tensor_bind.h" -#include "pybind11/numpy.h" -#include "pybind11/pybind11.h" -#include "pybind11/stl.h" - -namespace py = pybind11; -namespace pd = paddle::framework; - -USE_OP(add_two); -USE_OP(onehot_cross_entropy); -USE_OP_WITHOUT_KERNEL(fc); -USE_OP(sgd); -USE_OP(mul); -USE_OP(mean); -USE_OP(sigmoid); -USE_OP(softmax); -USE_OP(rowwise_add); -USE_OP_WITHOUT_KERNEL(recurrent_op); - -template -void ExposeOperator(ClassType& m) { - m.def("infer_shape", &ClassType::type::InferShape) - .def("run", &ClassType::type::Run) - .def("outputs", - [](const typename ClassType::type& op) -> std::vector { - return op.outputs_; - }) - .def("__str__", &ClassType::type::DebugString); -} - -static size_t UniqueIntegerGenerator() { - static std::atomic generator; - return generator.fetch_add(1); -} - -PYBIND11_PLUGIN(core) { - py::module m("core", "C++ core of PaddlePaddle"); - - py::class_(m, "Tensor", py::buffer_protocol()) - .def_buffer([](pd::Tensor& self) -> py::buffer_info { - return paddle::pybind::CastToPyBuffer(self); - }) - .def("get_dims", - [](const pd::Tensor& self) { return pd::vectorize(self.dims()); }) - .def("set_dims", - [](pd::Tensor& self, const std::vector& dim) { - self.Resize(pd::make_ddim(dim)); - }) - .def("alloc_float", - [](pd::Tensor& self) { - self.mutable_data(paddle::platform::CPUPlace()); - }) - .def("alloc_int", - [](pd::Tensor& self) { - 
self.mutable_data(paddle::platform::CPUPlace()); - }) - .def("set", paddle::pybind::PyTensorSetFromArray) - .def("set", paddle::pybind::PyTensorSetFromArray) - .def("shape", - [](pd::Tensor& self) { return pd::vectorize(self.dims()); }); - - py::class_(m, "Variable", R"DOC(Variable Class. - -All parameter, weight, gradient are variables in Paddle. -)DOC") - .def("is_int", [](const pd::Variable& var) { return var.IsType(); }) - .def("set_int", - [](pd::Variable& var, int val) -> void { - *var.GetMutable() = val; - }) - .def("get_int", - [](const pd::Variable& var) -> int { return var.Get(); }) - .def("get_tensor", - [](pd::Variable& self) -> pd::Tensor* { - return self.GetMutable(); - }, - py::return_value_policy::reference) - .def("get_net", - [](pd::Variable& self) -> pd::NetOp* { - return self.GetMutable(); - }, - py::return_value_policy::reference); - - py::class_(m, "Scope", "") - .def("new_var", - [](pd::Scope& self, const std::string& name) -> pd::Variable* { - return self.NewVar(name); - }, - py::return_value_policy::reference) - .def("find_var", &pd::Scope::FindVar, py::return_value_policy::reference) - .def(py::init<>()) - .def("new_scope", - [](pd::Scope& self) -> pd::Scope* { return &self.NewScope(); }, - py::return_value_policy::reference) - .def("drop_kids", &pd::Scope::DropKids); - - //! @note: Be careful! PyBind will return std::string as an unicode, not - //! Python str. If you want a str object, you should cast them in Python. - m.def("get_all_op_protos", []() -> std::vector { - auto& protos = pd::OpRegistry::protos(); - std::vector ret_values; - for (auto it = protos.begin(); it != protos.end(); ++it) { - PADDLE_ENFORCE(it->second.IsInitialized(), - "OpProto must all be initialized"); - std::string str; - PADDLE_ENFORCE(it->second.SerializeToString(&str), - "Serialize OpProto Error. 
This could be a bug of Paddle."); - ret_values.push_back(py::bytes(str)); - } - return ret_values; - }); - m.def_submodule( - "var_names", - "The module will return special predefined variable name in Paddle") - .def("empty", pd::OperatorBase::EMPTY_VAR_NAME) - .def("temp", pd::OperatorBase::TMP_VAR_NAME); - - py::class_(m, "DeviceContext") - .def_static("cpu_context", []() -> paddle::platform::DeviceContext* { - return new paddle::platform::CPUDeviceContext(); - }); - - py::class_> operator_base( - m, "Operator"); - - operator_base.def_static("create", [](py::bytes protobin) { - pd::OpDesc desc; - PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), - "Cannot parse user input to OpDesc"); - PADDLE_ENFORCE(desc.IsInitialized(), - "User OpDesc is not initialized, reason %s", - desc.InitializationErrorString()); - return pd::OpRegistry::CreateOp(desc); - }); - ExposeOperator(operator_base); - - py::class_> net(m, "Net"); - - net.def_static("create", - []() -> std::shared_ptr { - auto retv = std::make_shared(); - retv->type_ = "plain_net"; - return retv; - }) - .def("add_op", &pd::NetOp::AddOp) - .def("add_op", - [](pd::NetOp& self, const std::shared_ptr& net) -> void { - self.AddOp(std::static_pointer_cast(net)); - }) - .def("complete_add_op", &pd::NetOp::CompleteAddOp) - .def("complete_add_op", - [](std::shared_ptr& self) { self->CompleteAddOp(); }); - ExposeOperator(net); - - m.def("unique_integer", UniqueIntegerGenerator); - - return m.ptr(); -} diff --git a/paddle/scripts/CMakeLists.txt b/paddle/scripts/CMakeLists.txt index 66a46e1883a49d491f0cb3056a7039407d72e337..a52f06fe497dac467e4ef2543ebda7a423ca326d 100644 --- a/paddle/scripts/CMakeLists.txt +++ b/paddle/scripts/CMakeLists.txt @@ -1,17 +1,15 @@ configure_file(submit_local.sh.in - submit_local.sh + paddle @ONLY) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/submit_local.sh DESTINATION bin +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle DESTINATION bin PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ - 
GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ - RENAME paddle) + GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ) configure_file(tools/usage_stat/usage.sh - usage.sh + paddle_usage @ONLY) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/usage.sh DESTINATION opt/paddle/bin +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle_usage DESTINATION opt/paddle/bin PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ - GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ - RENAME paddle_usage) + GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 3860facb099950a5287d3f6b89c3de38f588f568..44442be4729ff77e8d378c93acebe1486eb75397 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -33,58 +33,71 @@ Configuring cmake in /paddle/build ... -DWITH_AVX=${WITH_AVX:-OFF} -DWITH_GOLANG=${WITH_GOLANG:-OFF} -DWITH_SWIG_PY=ON + -DWITH_C_API=${WITH_C_API:-OFF} + -DWITH_PYTHON=${WITH_PYTHON:-ON} + -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} -DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} -DWITH_TESTING=${WITH_TESTING:-OFF} -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ======================================== EOF + +# Disable UNITTEST_USE_VIRTUALENV in docker because +# docker environment is fully controlled by this script. +# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option. cmake .. 
\ -DCMAKE_BUILD_TYPE=Release \ -DWITH_DOC=OFF \ -DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_GOLANG=${WITH_GOLANG:-OFF} \ - -DWITH_SWIG_PY=ON \ + -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \ + -DWITH_C_API=${WITH_C_API:-OFF} \ + -DWITH_PYTHON=${WITH_PYTHON:-ON} \ -DCUDNN_ROOT=/usr/ \ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \ -DWITH_TESTING=${WITH_TESTING:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON cat <> /paddle/build/Dockerfile < /dev/null -SCRIPTPATH=$PWD -popd > /dev/null - -USE_VIRTUALENV_FOR_TEST=$1; shift -PYTHON=$1; shift - -if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then - rm -rf .test_env - virtualenv .test_env - unset PYTHONHOME - unset PYTHONPATH - source .test_env/bin/activate - PYTHON=python -fi - -$PYTHON -m pip install $SCRIPTPATH/../dist/*.whl - -if [ "X${PADDLE_PACKAGE_DIR}" != "X" ]; then - $PYTHON -m pip install ${PADDLE_PACKAGE_DIR}/*.whl -else - export PYTHONPATH=$SCRIPTPATH/../../python/ -fi - -$PYTHON -m pip install ipython==5.3 - -for fn in "$@" -do - echo "test $fn" - $PYTHON $fn - if [ $? -ne 0 ]; then - exit 1 - fi -done - -if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then - deactivate - rm -rf .test_env -fi diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in old mode 100644 new mode 100755 diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index a44385158042a23eca175df261852148642f7fa0..dfcff38302703066e868c60e213f0f7cbc55a31e 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -5,15 +5,9 @@ set -e mkdir -p $TRAVIS_BUILD_DIR/build cd $TRAVIS_BUILD_DIR/build -# Compile paddle binaries first -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_GOLANG=ON -DWITH_STYLE_CHECK=OFF - -mkdir output -make -j `nproc` -find .. -name '*whl' | xargs pip install # install all wheels. -rm -rf * # Compile Documentation only. -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON +cmake .. 
-DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON +make -j `nproc` gen_proto_py make -j `nproc` paddle_docs paddle_docs_cn # check websites for broken links @@ -35,6 +29,7 @@ TARGET_BRANCH="gh-pages" SOURCE_BRANCH="master" # Clone the repo to output directory +mkdir output git clone $REPO output cd output diff --git a/paddle/setup.py.in b/paddle/setup.py.in deleted file mode 100644 index 06d55d3abc6097fa7d4b2b2ac9e29681e0fddfd5..0000000000000000000000000000000000000000 --- a/paddle/setup.py.in +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from setuptools import setup, Extension - -setup(name="py_paddle", - version="${PADDLE_VERSION}", - packages=['py_paddle'], - include_package_data=True, - package_data={'py_paddle':['*.py','_swig_paddle.so']}, - install_requires = [ - 'nltk>=3.2.2', - 'numpy>=1.8.0', # The numpy is required. - 'protobuf==${PROTOBUF_VERSION}' # The paddle protobuf version - ], - url='http://www.paddlepaddle.org/', - license='Apache 2.0', -) diff --git a/paddle/string/piece.h b/paddle/string/piece.h index 0272529d1c9b2cb6000a26f1d4d80276d06bf27b..03ae9243a4cc4e9e92e376bf46ab2b1d7162dfcb 100644 --- a/paddle/string/piece.h +++ b/paddle/string/piece.h @@ -39,8 +39,8 @@ public: // size_ is 0. 
Piece(); Piece(const char* d, size_t n); - Piece(const char* d); - Piece(const std::string& s); + Piece(const char* d); // NOLINT: accept C string into Piece. + Piece(const std::string& s); // NOLINT: accept C++ string into Piece. const char* data() const { return data_; } size_t len() const { return size_; } diff --git a/paddle/trainer/tests/compare_sparse_data b/paddle/trainer/tests/compare_sparse_data new file mode 100644 index 0000000000000000000000000000000000000000..18fc6541383d8e8e1687b8fe1abd57aece3d4cfc Binary files /dev/null and b/paddle/trainer/tests/compare_sparse_data differ diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data similarity index 100% rename from paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto rename to paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist index 8b041cd66416862a78dba27368a65860a68ef1a5..6b406dff0ba91b5f310d7eafa111c0d21d6542c3 100644 --- a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist +++ b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist @@ -1 +1 @@ -./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto +./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data diff --git a/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf b/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf new file mode 100644 index 0000000000000000000000000000000000000000..92f32a18c0068ab4672034a270aa8c52f2716d59 --- /dev/null +++ b/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf @@ -0,0 
+1,154 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. + +# Note: when making change to this file, please make sure +# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest +# for comparing these two nets can pass (test_CompareTwoNets) + +default_initial_std(0.1) +default_device(0) + +word_dim = 999 +l1 = 0 +l2 = 0 + +model_type("nn") + +sparse_update = get_config_arg("sparse_update", bool, False) + +TrainData(ProtoData( + type = "proto_sequence", + files = ('trainer/tests/train_sparse.list'), + )) + +Settings( + algorithm='sgd', + batch_size=100, + learning_rate=0.0001, + learning_rate_decay_a=4e-08, + learning_rate_decay_b=0.0, + learning_rate_schedule='poly', +) + + +wordvec_dim = 32 +layer2_dim = 16 +layer3_dim = 16 +hidden_dim = 32 + +slot_names = ["qb", "qw", "tb", "tw"] + +def ltr_network(network_name, + word_dim=word_dim, + wordvec_dim=wordvec_dim, + layer2_dim=layer2_dim, + layer3_dim=layer3_dim, + hidden_dim=hidden_dim, + slot_names=slot_names, + l1=l1, + l2=l2): + + slotnum = len(slot_names) + for i in xrange(slotnum): + Inputs(slot_names[i] + network_name) + for i in xrange(slotnum): + Layer( + name = slot_names[i] + network_name, + type = "data", + size = word_dim, + device = -1, + ) + Layer( + name = slot_names[i] + "_embedding_" + 
network_name, + type = "mixed", + size = wordvec_dim, + bias = False, + device = -1, + inputs = TableProjection(slot_names[i] + network_name, + parameter_name = "embedding.w0", + decay_rate_l1=l1, + sparse_remote_update = True, + sparse_update = sparse_update, + ), + ) + Layer( + name = slot_names[i] + "_rnn1_" + network_name, + type = "recurrent", + active_type = "tanh", + bias = Bias(initial_std = 0, + parameter_name = "rnn1.bias"), + inputs = Input(slot_names[i] + "_embedding_" + network_name, + parameter_name = "rnn1.w0") + ) + Layer( + name = slot_names[i] + "_rnnlast_" + network_name, + type = "seqlastins", + inputs = [ + slot_names[i] + "_rnn1_" + network_name, + ], + ) + + Layer( + name = "layer2_" + network_name, + type = "fc", + active_type = "tanh", + size = layer2_dim, + bias = Bias(parameter_name = "layer2.bias"), + inputs = [Input(slot_name + "_rnnlast_" + network_name, + parameter_name = "_layer2_" + slot_name + ".w", + decay_rate = l2, + initial_smart = True) for slot_name in slot_names] + ) + Layer( + name = "layer3_" + network_name, + type = "fc", + active_type = "tanh", + size = layer3_dim, + bias = Bias(parameter_name = "layer3.bias"), + inputs = [ + Input("layer2_" + network_name, + parameter_name = "_layer3.w", + decay_rate = l2, + initial_smart = True), + ] + ) + Layer( + name = "output_" + network_name, + type = "fc", + size = 1, + bias = False, + inputs = [ + Input("layer3_" + network_name, + parameter_name = "_layerO.w"), + ], + ) + + +ltr_network("left") +ltr_network("right") +Inputs("label") +Layer( + name = "label", + type = "data", + size = 1, + ) +Outputs("cost", "qb_rnnlast_left") +Layer( + name = "cost", + type = "rank-cost", + inputs = ["output_left", "output_right", "label"], + ) diff --git a/paddle/trainer/tests/simple_sparse_neural_network.py b/paddle/trainer/tests/simple_sparse_neural_network.py index 9604e1b9b45e571130c2f1bdc6d6a5fbd9c177c4..30346ef299d0bc8585ccff7f2fc4885b0d9f9dfc 100644 --- 
a/paddle/trainer/tests/simple_sparse_neural_network.py +++ b/paddle/trainer/tests/simple_sparse_neural_network.py @@ -1,6 +1,6 @@ from paddle.trainer_config_helpers import * -settings(batch_size=128, learning_method=AdaGradOptimizer(), learning_rate=1e-4) +settings(batch_size=17, learning_method=AdaGradOptimizer(), learning_rate=1e-4) file_list = 'trainer/tests/fake_file_list.list' @@ -12,7 +12,7 @@ define_py_data_sources2( embedding = embedding_layer( input=data_layer( - name="word_ids", size=65536), + name="word_ids", size=8191), size=128, param_attr=ParamAttr(sparse_update=True)) prediction = fc_layer(input=embedding, size=10, act=SoftmaxActivation()) diff --git a/paddle/trainer/tests/simple_sparse_neural_network_dp.py b/paddle/trainer/tests/simple_sparse_neural_network_dp.py index 8bfd1f37e7114f2dcd0798ff1e8180b111ad988f..86b272edfe1bbb23c45cffe282f6475ceaa0cc41 100644 --- a/paddle/trainer/tests/simple_sparse_neural_network_dp.py +++ b/paddle/trainer/tests/simple_sparse_neural_network_dp.py @@ -7,15 +7,15 @@ def init_hook(settings, is_train, **kwargs): @provider( - input_types={'word_ids': integer_value(65536), + input_types={'word_ids': integer_value(8191), 'label': integer_value(10)}, min_pool_size=0, init_hook=init_hook) def process(settings, filename): if settings.is_train: - data_size = 2**20 - else: data_size = 2**10 + else: + data_size = 2**5 for _ in xrange(data_size): - yield random.randint(0, 65535), random.randint(0, 9) + yield random.randint(0, 8190), random.randint(0, 9) diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp index a7000eb77e1bbeab4f6e38c0322f82bde7164080..813275518e411d6e963e23df634541f771096e0f 100644 --- a/paddle/trainer/tests/test_CompareSparse.cpp +++ b/paddle/trainer/tests/test_CompareSparse.cpp @@ -23,7 +23,7 @@ using namespace paddle; // NOLINT using namespace std; // NOLINT static const string& configFile1 = - "trainer/tests/sample_trainer_config_qb_rnn.conf"; + 
"trainer/tests/sample_trainer_config_compare_sparse.conf"; DECLARE_bool(use_gpu); DECLARE_string(config); diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp index 4d0174f784a0dc7314977d586c3ad1f0f9c69f6d..00ba61377aeff17d82e03f7560c0d71b3570d14f 100644 --- a/paddle/trainer/tests/test_TrainerOnePass.cpp +++ b/paddle/trainer/tests/test_TrainerOnePass.cpp @@ -100,25 +100,25 @@ TEST(average_window, gpu) { } TEST(average_window, gpu2) { - FLAGS_num_passes = 100; + FLAGS_num_passes = 20; trainerOnePassTest(configFile1, true, false, 2, 0.01); FLAGS_num_passes = 1; } TEST(average_window, gpu4) { - FLAGS_num_passes = 100; + FLAGS_num_passes = 20; trainerOnePassTest(configFile1, true, false, 4, 0.01); FLAGS_num_passes = 1; } TEST(average_window_cpu, gpu2) { - FLAGS_num_passes = 100; + FLAGS_num_passes = 20; trainerOnePassTest(configFile1, true, false, 2, 0.01, true); FLAGS_num_passes = 1; } TEST(average_window_cpu, gpu4) { - FLAGS_num_passes = 100; + FLAGS_num_passes = 20; trainerOnePassTest(configFile1, true, false, 4, 0.01, true); FLAGS_num_passes = 1; } diff --git a/paddle/trainer/tests/train_sparse.list b/paddle/trainer/tests/train_sparse.list new file mode 100644 index 0000000000000000000000000000000000000000..6ea020e2202f8464f8a647cd96c84a9d17a03ae3 --- /dev/null +++ b/paddle/trainer/tests/train_sparse.list @@ -0,0 +1 @@ +trainer/tests/compare_sparse_data diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt index 18584cafe7971bad281b498908c54780250791b7..e1cea8bd0de5394020a498725485cea025512e48 100644 --- a/proto/CMakeLists.txt +++ b/proto/CMakeLists.txt @@ -17,7 +17,7 @@ foreach(filename ${proto_filenames}) COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ARGS "--python_out=${PROJ_ROOT}/python/paddle/proto" "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL} - DEPENDS ${ABS_FIL} ${external_project_dependencies}) + DEPENDS ${ABS_FIL} protoc) endforeach() add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY}) diff --git 
a/proto/DataConfig.proto b/proto/DataConfig.proto index e895c184d9f95dba1449e6467a2566712837600b..0cb5d7afbb3e1cb4abe45c0ed677e09b27b870fa 100644 --- a/proto/DataConfig.proto +++ b/proto/DataConfig.proto @@ -15,14 +15,13 @@ syntax = "proto2"; package paddle; - message FileGroupConf { - optional uint32 queue_capacity = 1 [default = 1]; + optional uint32 queue_capacity = 1 [ default = 1 ]; // how many files to load for a load file thread - optional int32 load_file_count = 2 [default = 1]; + optional int32 load_file_count = 2 [ default = 1 ]; // how many threads to load files // Setting to be 5~10 is appropriate when loading files by hadoop vfs - optional int32 load_thread_num = 3 [default = 1]; + optional int32 load_thread_num = 3 [ default = 1 ]; }; message DataConfig { @@ -32,26 +31,28 @@ message DataConfig { // name of a text file which contains a list of file names at each line optional string files = 3; - optional int32 feat_dim = 4;//feature dimension of one frame - repeated int32 slot_dims = 5;//feature slot dims - optional int32 context_len = 6;//max neibour frame numbers - optional uint64 buffer_capacity = 7;//the number of samples + optional int32 feat_dim = 4; // feature dimension of one frame + repeated int32 slot_dims = 5; // feature slot dims + optional int32 context_len = 6; // max neibour frame numbers + optional uint64 buffer_capacity = 7; // the number of samples - //part of data used in training - //if not -1, part of train data is used in training - optional int64 train_sample_num = 8 [default = -1]; + // part of data used in training + // if not -1, part of train data is used in training + optional int64 train_sample_num = 8 [ default = -1 ]; - //The number of documents processed once - optional int32 file_load_num = 9 [default = -1]; - optional bool async_load_data = 12 [default = false]; + // The number of documents processed once + optional int32 file_load_num = 9 [ default = -1 ]; + optional bool async_load_data = 12 [ default = false ]; /// 
Note the field number 10, 11 and 13 have been deprecated. - optional bool for_test = 14 [default = false]; // whether this data is for test + optional bool for_test = 14 + [ default = false ]; // whether this data is for test optional FileGroupConf file_group_conf = 15; repeated int32 float_slot_dims = 16; /// Note the field number 17, 18 and 19 have been deprecated. - // a list of values which will be used to create additional one dimensional float + // a list of values which will be used to create additional one dimensional + // float // values slots. These one dimensional slots can be used as the weight input // for cost layers. // Currently this is only supported by ProtoDataProvider. @@ -65,21 +66,21 @@ message DataConfig { // for MultiDataProvider repeated DataConfig sub_data_configs = 24; // sub dataproviders - /* - * the ratio of each sub dataproviders: - * e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100, - * then each mini-batch is combined by 10 instance from A and 90 instances - * from B. - */ + /* + * the ratio of each sub dataproviders: + * e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100, + * then each mini-batch is combined by 10 instance from A and 90 instances + * from B. + */ optional int32 data_ratio = 25; /* * if one of the sub dataproviders is running out of data, then * (1) it is "main data", then finish current pass. * (2) it is not "main data", then reset it, and try getNextBatch again. */ - optional bool is_main_data = 26 [default = true]; + optional bool is_main_data = 26 [ default = true ]; - // the usage ratio of instances. Setting to 1.0 means the use of all instances. - optional double usage_ratio = 27 [default = 1.0]; + // the usage ratio of instances. Setting to 1.0 means the use of all + // instances. 
+ optional double usage_ratio = 27 [ default = 1.0 ]; }; - diff --git a/proto/DataFormat.proto b/proto/DataFormat.proto index 19b1499b0281a1b92028cc8944c27ee4d56b8dd2..7d963bc29f7c6b9895323b0d57ba4ee4cb4387d0 100644 --- a/proto/DataFormat.proto +++ b/proto/DataFormat.proto @@ -17,27 +17,32 @@ package paddle; /* If values is not empty and ids is empty, this is a dense vector. - If values is not empty and ids is not empty, this is a sparse vector. The position of each value + If values is not empty and ids is not empty, this is a sparse vector. The + position of each value is specified by ids. - If values is empty and ids is not empty, this is a sparse vector whose non-zero values are 1. + If values is empty and ids is not empty, this is a sparse vector whose non-zero + values are 1. The position of each 1 is specified by ids. */ message VectorSlot { - repeated float values = 1 [packed = true]; - repeated uint32 ids = 2 [packed = true]; + repeated float values = 1 [ packed = true ]; + repeated uint32 ids = 2 [ packed = true ]; /* For multidimensional data, for example "image width height depth" */ - repeated uint32 dims = 3 [packed = true]; - repeated string strs = 4; + repeated uint32 dims = 3 [ packed = true ]; + repeated string strs = 4; }; /* - SubseqSlot use to record whether VectorSlot or any other slot in future has subseq. - If not all VectorSlot have subseq, we only store the one who has subseq, and use *slot_id* to record it. - One vector_slots has one sequence, and it may have N subseq, thus the number of *lens* will be N too. + SubseqSlot use to record whether VectorSlot or any other slot in future has + subseq. + If not all VectorSlot have subseq, we only store the one who has subseq, and + use *slot_id* to record it. + One vector_slots has one sequence, and it may have N subseq, thus the number of + *lens* will be N too. 
*/ message SubseqSlot { - required uint32 slot_id = 1; //the id of slot who has subseq - repeated uint32 lens = 2; // lengths of sub-sequence in the slot + required uint32 slot_id = 1; // the id of slot who has subseq + repeated uint32 lens = 2; // lengths of sub-sequence in the slot }; message SlotDef { @@ -45,13 +50,14 @@ message SlotDef { VECTOR_DENSE = 0; VECTOR_SPARSE_NON_VALUE = 1; VECTOR_SPARSE_VALUE = 2; - INDEX = 3; // This can be used as label, or word id, etc. + INDEX = 3; // This can be used as label, or word id, etc. VAR_MDIM_DENSE = 4; VAR_MDIM_INDEX = 5; STRING = 6; } required SlotType type = 1; - required uint32 dim = 2; // For INDEX slots, this means the maximal index plus 1. + required uint32 dim = + 2; // For INDEX slots, this means the maximal index plus 1. }; message DataHeader { @@ -60,11 +66,11 @@ message DataHeader { }; message DataSample { - optional bool is_beginning = 1 [default = true]; // is the beginning of a sequence + optional bool is_beginning = 1 + [ default = true ]; // is the beginning of a sequence repeated VectorSlot vector_slots = 2; - repeated uint32 id_slots = 3 [packed = true]; + repeated uint32 id_slots = 3 [ packed = true ]; /* use ids of VectorSlot */ repeated VectorSlot var_id_slots = 4; repeated SubseqSlot subseq_slots = 5; }; - diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 3bee5b572ae42750332b69e28af980ae325532da..4f3d5bf3f6cb96c97285f40e3a3d100c2af47ad5 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -21,7 +21,6 @@ package paddle; * Various structs for the configuration of a neural network */ - message ExternalConfig { repeated string layer_names = 1; repeated string input_layer_names = 2; @@ -68,7 +67,7 @@ message ConvConfig { required uint32 img_size = 8; // caffe mode for output size coherence - required bool caffe_mode = 9 [default = true]; + required bool caffe_mode = 9 [ default = true ]; // if filter_size_y is set , this convolutional layer will use // filters of 
size filter_size * filter_size_y pixels. @@ -99,7 +98,7 @@ message PoolConfig { optional uint32 start = 4; // Defines the stride size between successive pooling squares. - required uint32 stride = 5 [default = 1]; + required uint32 stride = 5 [ default = 1 ]; // The size of output feature map. required uint32 output_x = 6; @@ -109,7 +108,7 @@ message PoolConfig { // padding = 4, instructs the net to implicitly // pad the images with a 4-pixel border of zeros. - optional uint32 padding = 8 [default = 0]; + optional uint32 padding = 8 [ default = 0 ]; // if not set, use size_x optional uint32 size_y = 9; @@ -194,9 +193,7 @@ message MaxOutConfig { required uint32 groups = 2; } -message RowConvConfig { - required uint32 context_length = 1; -} +message RowConvConfig { required uint32 context_length = 1; } message SliceConfig { required uint32 start = 1; @@ -212,14 +209,14 @@ message ProjectionConfig { // For ShiftProjection optional int32 context_start = 5; optional int32 context_length = 6; - optional bool trainable_padding = 7 [default = false]; + optional bool trainable_padding = 7 [ default = false ]; // For convolution optional ConvConfig conv_conf = 8; optional int32 num_filters = 9; // For IdentityOffsetProjection - optional uint64 offset = 11 [default = 0]; + optional uint64 offset = 11 [ default = 0 ]; // For pool optional PoolConfig pool_conf = 12; @@ -236,7 +233,7 @@ message OperatorConfig { required uint64 output_size = 4; // For DotMulOperator - optional double dotmul_scale = 5 [default = 1.0]; + optional double dotmul_scale = 5 [ default = 1.0 ]; // For ConvOperator optional ConvConfig conv_conf = 6; @@ -282,8 +279,8 @@ message MultiBoxLossConfig { required float neg_overlap = 4; required uint32 background_id = 5; required uint32 input_num = 6; - optional uint32 height = 7 [default = 1]; - optional uint32 width = 8 [default = 1]; + optional uint32 height = 7 [ default = 1 ]; + optional uint32 width = 8 [ default = 1 ]; } message DetectionOutputConfig { @@ 
-294,8 +291,13 @@ message DetectionOutputConfig { required uint32 input_num = 5; required uint32 keep_top_k = 6; required float confidence_threshold = 7; - optional uint32 height = 8 [default = 1]; - optional uint32 width = 9 [default = 1]; + optional uint32 height = 8 [ default = 1 ]; + optional uint32 width = 9 [ default = 1 ]; +} + +message ClipConfig { + required double min = 1; + required double max = 2; } message LayerInputConfig { @@ -318,6 +320,7 @@ message LayerInputConfig { optional RowConvConfig row_conv_conf = 15; optional MultiBoxLossConfig multibox_loss_conf = 16; optional DetectionOutputConfig detection_output_conf = 17; + optional ClipConfig clip_conf = 18; } message LayerConfig { @@ -325,7 +328,7 @@ message LayerConfig { required string name = 1; required string type = 2; optional uint64 size = 3; - //optional ActivationConfig activation = 4; + // optional ActivationConfig activation = 4; optional string active_type = 4; repeated LayerInputConfig inputs = 5; optional string bias_parameter_name = 6; @@ -338,7 +341,7 @@ message LayerConfig { // (which is how convnets are usually trained). Setting this to // false will untie the biases, yielding a separate bias for // every location at which the filter is applied. - optional bool shared_biases = 8 [default = false]; + optional bool shared_biases = 8 [ default = false ]; // Valid values are ones that divide the area of the output // grid in this convolutional layer. For example if this layer @@ -356,33 +359,35 @@ message LayerConfig { // the gpu device which the Layer's data in. // Only used by ParallelNeuralNetork. Ignored otherwise. - optional int32 device = 12 [default = -1]; + optional int32 device = 12 [ default = -1 ]; - // for recurrent layer. If true, the recurrence runs from the end to the beginning. - optional bool reversed = 13 [default = false]; + // for recurrent layer. If true, the recurrence runs from the end to the + // beginning. 
+ optional bool reversed = 13 [ default = false ]; - // for lstmemory layer. Different types of nodes have different activation type. - optional string active_gate_type = 14; + // for lstmemory layer. Different types of nodes have different activation + // type. + optional string active_gate_type = 14; optional string active_state_type = 15; // For NCELayer // The number of random negative labels for each sample - optional int32 num_neg_samples = 16 [default = 10]; + optional int32 num_neg_samples = 16 [ default = 10 ]; // For NCELayer // The distribution for generating the random negative labels. // A uniform distribution will be used if not provided - repeated double neg_sampling_dist = 17 [packed = true]; + repeated double neg_sampling_dist = 17 [ packed = true ]; // For MaxLayer // default: output VALUE of MaxLayer. set this flag to true for output INDEX // INDEX will be put in Argument::value as double values. - optional bool output_max_index = 19 [default = false]; + optional bool output_max_index = 19 [ default = false ]; /// The filed number 20 have been deprecated. // For self-normalized estimation - optional double softmax_selfnorm_alpha = 21 [default = 0.1]; + optional double softmax_selfnorm_alpha = 21 [ default = 0.1 ]; /// The filed numbers 22 and 23 have been deprecated. 
@@ -393,14 +398,14 @@ message LayerConfig { optional bool norm_by_times = 25; // for CostLayers - optional double coeff = 26 [default = 1.0]; + optional double coeff = 26 [ default = 1.0 ]; // for AverageLayer // can be set to: 'average', 'sum' or 'squarerootn' optional string average_strategy = 27; // for error clipping - optional double error_clipping_threshold = 28 [default = 0.0]; + optional double error_clipping_threshold = 28 [ default = 0.0 ]; // for operators used by mixed layer repeated OperatorConfig operator_confs = 29; @@ -428,43 +433,44 @@ message LayerConfig { optional uint32 beam_size = 39; // for seqlastins layer, whether select first instead last - optional bool select_first = 40 [default = false]; + optional bool select_first = 40 [ default = false ]; // for seqlastins layer, AverageLayer, MaxLayer and ExpandLayer // can be set to: 'non-seq','seq' - optional string trans_type = 41 [default = 'non-seq']; + optional string trans_type = 41 [ default = 'non-seq' ]; // to indicate whether selective_fc layer // is used in sequence generation or not - optional bool selective_fc_pass_generation = 42 [default = false]; + optional bool selective_fc_pass_generation = 42 [ default = false ]; // to indicate whether selective_fc layer take its last input to // selected several columns and only compute the multiplications // between the input matrices and the selected columns of // the parameter matrices of this layer. // if set false, selective_fc degrades into fc. - optional bool has_selected_colums = 43 [default = true]; + optional bool has_selected_colums = 43 [ default = true ]; // this parameter is for speed consideration. // if number of the selected columns is less than // sample number * selective_fc output size * selective_fc_mull_mull_ratio // sparse multiplication is used, otherwise, using full multiplication. 
- optional double selective_fc_full_mul_ratio = 44 [default = 0.02]; + optional double selective_fc_full_mul_ratio = 44 [ default = 0.02 ]; // to indicate how many threads selective_fc use to to accelate // the plain_mul period // leave empty or set to 0 to disable multi-thread accleleration - optional uint32 selective_fc_parallel_plain_mul_thread_num = 45 [default = 0]; + optional uint32 selective_fc_parallel_plain_mul_thread_num = 45 + [ default = 0 ]; // for batch normalization layer // if set use_global_stats true, will use the loaded mean and variance. optional bool use_global_stats = 46; // use to compute moving mean and variance. - optional double moving_average_fraction = 47 [default = 0.9]; + optional double moving_average_fraction = 47 [ default = 0.9 ]; // bias size - optional uint32 bias_size = 48 [default = 0]; + optional uint32 bias_size = 48 [ default = 0 ]; // this parameter can be used as a user-defined parameter when necessary, // without changing the proto file. @@ -479,18 +485,17 @@ message LayerConfig { optional uint64 width = 51; // blank label used in ctc loss - optional uint32 blank = 52 [default = 0]; + optional uint32 blank = 52 [ default = 0 ]; // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which // controls the scope of pooling operation. can be set > 0. // leave empty or set to -1 to disable this stride pooling. 
- optional int32 seq_pool_stride = 53 [default = -1]; + optional int32 seq_pool_stride = 53 [ default = -1 ]; // for crop layer - optional int32 axis = 54 [default = 2]; + optional int32 axis = 54 [ default = 2 ]; repeated uint32 offset = 55; repeated uint32 shape = 56; - } message EvaluatorConfig { @@ -506,9 +511,9 @@ message EvaluatorConfig { // Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator // For multi binary labels: true if output > classification_threshold - optional double classification_threshold = 6 [default = 0.5]; + optional double classification_threshold = 6 [ default = 0.5 ]; // The positive label. -1 means average precision and recall - optional int32 positive_label = 7 [default = -1]; + optional int32 positive_label = 7 [ default = -1 ]; // load dict from this file optional string dict_file = 8; @@ -517,10 +522,10 @@ message EvaluatorConfig { optional string result_file = 9; // top # results for max id printer - optional int32 num_results = 10 [default = 1]; + optional int32 num_results = 10 [ default = 1 ]; // whether to delimit the sequence in the seq_text_printer - optional bool delimited = 11 [default = true]; + optional bool delimited = 11 [ default = true ]; // Used by ChunkEvaluator // chunk of these types are not counted @@ -528,23 +533,23 @@ message EvaluatorConfig { // Used by ClassificationErrorEvaluator // top # classification error - optional int32 top_k = 13 [default = 1]; + optional int32 top_k = 13 [ default = 1 ]; // Used by DetectionMAPEvaluator - optional double overlap_threshold = 14 [default = 0.5]; + optional double overlap_threshold = 14 [ default = 0.5 ]; - optional int32 background_id = 15 [default = 0]; + optional int32 background_id = 15 [ default = 0 ]; - optional bool evaluate_difficult = 16 [default = false]; + optional bool evaluate_difficult = 16 [ default = false ]; - optional string ap_type = 17 [default = "11point"]; + optional string ap_type = 17 [ default = "11point" ]; } message LinkConfig { 
required string layer_name = 1; required string link_name = 2; // If true, this link has sub-sequence - optional bool has_subseq = 3 [default = false]; + optional bool has_subseq = 3 [ default = false ]; } message MemoryConfig { @@ -557,18 +562,18 @@ message MemoryConfig { optional uint32 boot_with_const_id = 7; // memory is a sequence, initailized by a sequence boot layer - optional bool is_sequence = 6 [default = false]; + optional bool is_sequence = 6 [ default = false ]; } message GeneratorConfig { required uint32 max_num_frames = 1; required string eos_layer_name = 2; - optional int32 num_results_per_sample = 3 [default = 1]; + optional int32 num_results_per_sample = 3 [ default = 1 ]; // for beam search - optional int32 beam_size = 4 [default = 1]; + optional int32 beam_size = 4 [ default = 1 ]; - optional bool log_prob = 5 [default = true]; + optional bool log_prob = 5 [ default = true ]; } message SubModelConfig { @@ -578,10 +583,10 @@ message SubModelConfig { repeated string output_layer_names = 4; repeated string evaluator_names = 5; - optional bool is_recurrent_layer_group = 6 [default = false]; + optional bool is_recurrent_layer_group = 6 [ default = false ]; // If true, the recurrence runs from the end to the beginning. - optional bool reversed = 7 [default = false]; + optional bool reversed = 7 [ default = false ]; // name and link name of memory repeated MemoryConfig memories = 8; @@ -595,14 +600,15 @@ message SubModelConfig { optional GeneratorConfig generator = 11; - // the id of inlink which share info with outlinks, used in recurrent layer group + // the id of inlink which share info with outlinks, used in recurrent layer + // group optional int32 target_inlinkid = 12; } message ModelConfig { // type of the model. 
// Currently, "nn", "recurrent_nn" and "recursive_nn" are supported - required string type = 1 [default = "nn"]; + required string type = 1 [ default = "nn" ]; // layers should be ordered in such a way that the forward propagation // can be correctly executed by going from the first layer to the last layer diff --git a/proto/OptimizerConfig.proto b/proto/OptimizerConfig.proto index 2a87e293f64d3398dea2641c3ff292eceec7e154..d27b1bcf80045216a5807812d39f7a248a956076 100644 --- a/proto/OptimizerConfig.proto +++ b/proto/OptimizerConfig.proto @@ -1,5 +1,5 @@ syntax = "proto2"; - + option optimize_for = LITE_RUNTIME; package paddle; @@ -9,13 +9,11 @@ message SGDConfig { // momentum: float >= 0. Parameter updates momentum. // decay: float >= 0. Learning rate decay over each update. // nesterov: boolean. Whether to apply Nesterov momentum. - optional double momentum = 21 [default = 0.0]; - optional double decay = 23 [default = 0.0]; - optional bool nesterov =24 [default = false]; - + optional double momentum = 21 [ default = 0.0 ]; + optional double decay = 23 [ default = 0.0 ]; + optional bool nesterov = 24 [ default = false ]; } - message AdadeltaConfig { // Adadelta // It is recommended to leave it at the default value. @@ -23,21 +21,23 @@ message AdadeltaConfig { // epsilon: float >= 0. Fuzz factor. // decay: float >= 0. Learning rate decay over each update. - // reference : [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701) - optional double rho = 33 [default = 0.90]; - optional double epsilon = 31 [default = 1e-5]; - optional double decay = 32 [default = 0.0]; - + // reference : [Adadelta - an adaptive learning rate + // method](http://arxiv.org/abs/1212.5701) + optional double rho = 33 [ default = 0.90 ]; + optional double epsilon = 31 [ default = 1e-5 ]; + optional double decay = 32 [ default = 0.0 ]; } message AdagradConfig { -// Adagrad -// epsilon: float >= 0. -// decay: float >= 0. Learning rate decay over each update. 
+ // Adagrad + // epsilon: float >= 0. + // decay: float >= 0. Learning rate decay over each update. -// reference : [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) - optional double epsilon = 41 [default = 1e-5]; - optional double decay = 42 [default = 0.0]; + // reference : [Adaptive Subgradient Methods for Online Learning and + // Stochastic + // Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + optional double epsilon = 41 [ default = 1e-5 ]; + optional double decay = 42 [ default = 0.0 ]; } message AdamConfig { @@ -46,7 +46,8 @@ message AdamConfig { // beta_2: float, 0 < beta < 1. Generally close to 1. // epsilon: float >= 0. Fuzz factor. // decay: float >= 0. Learning rate decay over each update. - // reference : [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8) + // reference : [Adam - A Method for Stochastic + // Optimization](http://arxiv.org/abs/1412.6980v8) optional double beta_1 = 41; optional double beta_2 = 42; optional double epsilon = 43; @@ -55,32 +56,32 @@ message AdamConfig { message ConstLrConfig { // learninRate Policy - optional double learning_rate = 1 [default = 1.0]; + optional double learning_rate = 1 [ default = 1.0 ]; } message LinearLrConfig { // learninRate Policy - optional double learning_rate = 1 [default = 1.0]; + optional double learning_rate = 1 [ default = 1.0 ]; optional double lr_decay_a = 2; optional double lr_decay_b = 3; } message TensorProto { -enum DataType { - PADDLE_ELEMENT_TYPE_INT32 = 0; - PADDLE_ELEMENT_TYPE_UINT32 = 1; - PADDLE_ELEMENT_TYPE_INT64 = 2; - PADDLE_ELEMENT_TYPE_UINT64 = 3; - PADDLE_ELEMENT_TYPE_FLOAT32 = 4; - PADDLE_ELEMENT_TYPE_FLOAT64 = 5; -} + enum DataType { + PADDLE_ELEMENT_TYPE_INT32 = 0; + PADDLE_ELEMENT_TYPE_UINT32 = 1; + PADDLE_ELEMENT_TYPE_INT64 = 2; + PADDLE_ELEMENT_TYPE_UINT64 = 3; + PADDLE_ELEMENT_TYPE_FLOAT32 = 4; + PADDLE_ELEMENT_TYPE_FLOAT64 = 5; 
+ } optional DataType data_type = 1; repeated bytes content = 2; } message LrPolicyState { // learninRate Policy - optional double learning_rate = 1 [default = 1.0]; + optional double learning_rate = 1 [ default = 1.0 ]; optional double lr_decay_a = 2; optional double lr_decay_b = 3; } @@ -104,7 +105,6 @@ message AdadeltaOptimizerState { optional TensorProto update_delta = 4; } - message AdagradOptimizerState { optional LrPolicyState lr_state = 101; optional double num_sample_passed = 104; @@ -124,10 +124,10 @@ message AdamOptimizerState { message OptimizerConfig { enum Optimizer { - SGD = 1; - Adadelta = 2; - Adagrad = 3; - Adam = 4; + SGD = 1; + Adadelta = 2; + Adagrad = 3; + Adam = 4; } optional Optimizer optimizer = 1; optional SGDConfig sgd = 3; @@ -136,8 +136,8 @@ message OptimizerConfig { optional AdamConfig adam = 6; enum LrPolicy { - Const = 0; - Linear = 1; + Const = 0; + Linear = 1; } optional LrPolicy lr_policy = 11; optional ConstLrConfig const_lr = 12; diff --git a/proto/ParameterConfig.proto b/proto/ParameterConfig.proto index 580d66324602df4c655dd2f1e1cd87159b5b346b..b13570a2c6e7b16e45892a31bb496a9dd2099df0 100644 --- a/proto/ParameterConfig.proto +++ b/proto/ParameterConfig.proto @@ -27,56 +27,57 @@ enum ParameterInitStrategy { message ParameterUpdaterHookConfig { // hook type such as 'pruning' required string type = 1; - // this represents the ratio of zero element to be set by the Parameter - optional double sparsity_ratio = 2 [default = 0.6]; + // this represents the ratio of zero element to be set by the Parameter + optional double sparsity_ratio = 2 [ default = 0.6 ]; } message ParameterConfig { required string name = 1; required uint64 size = 2; - optional double learning_rate = 3 [default = 1.0]; - optional double momentum = 4 [default = 0.0]; - optional double initial_mean = 5 [default = 0.0]; - optional double initial_std = 6 [default = 0.01]; + optional double learning_rate = 3 [ default = 1.0 ]; + optional double momentum = 4 [ default = 
0.0 ]; + optional double initial_mean = 5 [ default = 0.0 ]; + optional double initial_std = 6 [ default = 0.01 ]; // use L2-regularization if decay_rate set and decay_rate_l1 not set - optional double decay_rate = 7 [default = 0.0]; + optional double decay_rate = 7 [ default = 0.0 ]; // use L1-regularization if decay_rate_l1 set - optional double decay_rate_l1 = 8 [default = 0.0]; + optional double decay_rate_l1 = 8 [ default = 0.0 ]; // dims of Parameter, e.g. dims[0] as height, dims[1] as width.. repeated uint64 dims = 9; // the gpu device which the parameter in. // Only used by ParallelNeuralNetork. Ignored otherwise. - optional int32 device = 10 [default = -1]; + optional int32 device = 10 [ default = -1 ]; // how to init the parameter: 0 -> normal, 1 -> uniform // 0: treat initial_mean as mean, intial_std as standard deviation // 1: range is (initial_mean - initial_std) to (initial_mean + initial_std) - optional int32 initial_strategy = 11 [default = 0]; + optional int32 initial_strategy = 11 [ default = 0 ]; // define the variance when init the parameter, by height of the Matrix - optional bool initial_smart = 12 [default = false]; + optional bool initial_smart = 12 [ default = false ]; // apply regularization every # batches - optional int32 num_batches_regularization = 13 [default = 1]; + optional int32 num_batches_regularization = 13 [ default = 1 ]; // if is_sparse is true, para is sparse, else para is dense - optional bool is_sparse = 14[default = false]; - // if para is sparse, format should be "csc" or "csr", empty means is not sparse - optional string format = 15 [default = ""]; + optional bool is_sparse = 14 [ default = false ]; + // if para is sparse, format should be "csc" or "csr", empty means is not + // sparse + optional string format = 15 [ default = "" ]; // sparse remote update or not - optional bool sparse_remote_update = 16 [default = false]; + optional bool sparse_remote_update = 16 [ default = false ]; // gradient clipping threshold, no 
clipping by default - optional double gradient_clipping_threshold = 17 [default = 0.0]; + optional double gradient_clipping_threshold = 17 [ default = 0.0 ]; // static parameters are fixed when training - optional bool is_static = 18 [default = false]; + optional bool is_static = 18 [ default = false ]; // para_id should NOT be set by config_parser. It is for // internal use. optional uint64 para_id = 19; repeated ParameterUpdaterHookConfig update_hooks = 20; // setup load mat -> csr - optional bool need_compact = 21 [default = false]; + optional bool need_compact = 21 [ default = false ]; // whether to do sparse update for this parameter - optional bool sparse_update = 22 [default = false]; + optional bool sparse_update = 22 [ default = false ]; // whether this parameter is shared or not. - optional bool is_shared = 23 [default = false]; + optional bool is_shared = 23 [ default = false ]; // parameter block size - optional uint64 parameter_block_size = 24 [default = 0]; + optional uint64 parameter_block_size = 24 [ default = 0 ]; } diff --git a/proto/ParameterServerConfig.proto b/proto/ParameterServerConfig.proto index 404f9613792653dda72eeb98f022851adedbfbfd..bd63cf35b1483a45f21de6f0d0d883e4d8432296 100644 --- a/proto/ParameterServerConfig.proto +++ b/proto/ParameterServerConfig.proto @@ -15,13 +15,10 @@ syntax = "proto2"; package paddle; - /** * Configuration structure for ParameterClient2. */ -message ParameterClientConfig { - required int32 trainer_id = 1; -} +message ParameterClientConfig { required int32 trainer_id = 1; } /** * Configuration structure for ParameterServer2. 
@@ -30,24 +27,24 @@ message ParameterServerConfig { // Number of ports for sending dense parameter, // following ports on parameter server will be visited // for sending dense parameter: [port, port+ports_num-1] - required int32 ports_num = 1 [default = 1]; + required int32 ports_num = 1 [ default = 1 ]; // Number of ports for sending sparse parameter, // following ports on parameter server will be visited // for sending sparse parameter: // [port+ports_num, port+ports_num+ports_num_for_sparse-1] - required int32 ports_num_for_sparse = 2 [default = 0]; + required int32 ports_num_for_sparse = 2 [ default = 0 ]; // network device name for pservers - required string nics = 3 [default = "xgbe0,xgbe1"]; - required string rdma_tcp = 4 [default = "tcp"]; + required string nics = 3 [ default = "xgbe0,xgbe1" ]; + required string rdma_tcp = 4 [ default = "tcp" ]; // Listening port for pserver - required int32 port = 5 [default = 20134]; + required int32 port = 5 [ default = 20134 ]; // number of gradient servers - required int32 num_gradient_servers = 6 [default = 1]; + required int32 num_gradient_servers = 6 [ default = 1 ]; // number of threads for sync op exec - required int32 pserver_num_threads = 7 [default = 1]; + required int32 pserver_num_threads = 7 [ default = 1 ]; // control config_.async_lagged_grad_discard_ratio() min value - required double async_lagged_ratio_min = 8 [default = 1.0]; + required double async_lagged_ratio_min = 8 [ default = 1.0 ]; // if async_lagged_grad_discard_ratio is not set in trainer_config.conf // use it as defalut value - required double async_lagged_ratio_default = 9 [default = 1.5]; + required double async_lagged_ratio_default = 9 [ default = 1.5 ]; } \ No newline at end of file diff --git a/proto/ParameterService.proto b/proto/ParameterService.proto index c1c04d8cc5bdedd09173d5dfa10b82c7ee7ed6a4..e3c180ccc3f2a9bfa13c443944cc5ae3398818a9 100644 --- a/proto/ParameterService.proto +++ b/proto/ParameterService.proto @@ -23,8 +23,8 @@ 
package paddle; */ enum ParameterUpdateMode { // Set parameter - PSERVER_UPDATE_MODE_SET_PARAM = 0;//use local param - PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1;//set zero param + PSERVER_UPDATE_MODE_SET_PARAM = 0; // use local param + PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1; // set zero param // Update parameter once a gradient is received PSERVER_UPDATE_MODE_ASYNC_SGD = 2; @@ -37,7 +37,7 @@ enum ParameterUpdateMode { // No update. Only get parameters back. PSERVER_UPDATE_MODE_GET_PARAM = 5; - PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6;//only get sparse rows + PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6; // only get sparse rows }; message ParameterBlock { @@ -80,42 +80,34 @@ message SendParameterRequest { optional int32 trainer_id = 7; // send back parameter type on pserver, PARAMETER_VALUE by default - optional int32 send_back_parameter_type = 8 [default = 0]; + optional int32 send_back_parameter_type = 8 [ default = 0 ]; // forwardbackward time in usec optional uint64 forwardbackward_time = 9; - } -message WaitPassStartRequest { -} +message WaitPassStartRequest {} -message WaitPassStartResponse { -} +message WaitPassStartResponse {} -message WaitPassFinishRequest { -} +message WaitPassFinishRequest {} -message WaitPassFinishResponse { -} +message WaitPassFinishResponse {} enum SyncObject { SYNC_DEFAULT = 0; // wait for the synchronizeBarrier_ - SYNC_DATA = 1; // wait for the synchronizeDataBarrier_ + SYNC_DATA = 1; // wait for the synchronizeDataBarrier_ } message SynchronizeRequest { - required SyncObject sync_object_id = 1 [default = SYNC_DEFAULT]; + required SyncObject sync_object_id = 1 [ default = SYNC_DEFAULT ]; optional int32 trainer_id = 2; } -message SynchronizeResponse { -} +message SynchronizeResponse {} -message SendParameterResponse { - repeated ParameterBlock blocks = 1; -} +message SendParameterResponse { repeated ParameterBlock blocks = 1; } message SetConfigRequest { repeated ParameterConfig param_configs = 1; @@ -125,26 +117,18 @@ message 
SetConfigRequest { required bool is_sparse_server = 6; } -message SetConfigResponse{ -} +message SetConfigResponse {} -message GetStatusRequest { -} +message GetStatusRequest {} -message GetStatusResponse { - required PServerStatus status = 1; -} +message GetStatusResponse { required PServerStatus status = 1; } -message SetStatusRequest { - required PServerStatus status = 1; -} +message SetStatusRequest { required PServerStatus status = 1; } -message SetStatusResponse { -} +message SetStatusResponse {} // create a column vector. The size is the dimension of parameter -message CreateVectorRequest { -} +message CreateVectorRequest {} message CreateVectorResponse { // error message. Empty if success @@ -153,9 +137,7 @@ message CreateVectorResponse { required int64 handle = 2; } -message ReleaseVectorRequest { - required int64 handle = 1; -} +message ReleaseVectorRequest { required int64 handle = 1; } message ReleaseVectorResponse { // error message. Empty if success @@ -164,9 +146,7 @@ message ReleaseVectorResponse { // Create a column major matrix. The number of rows is the dimension // of parameter. The number of columns is specifed by num_cols -message CreateMatrixRequest { - required int32 num_cols = 1; -} +message CreateMatrixRequest { required int32 num_cols = 1; } message CreateMatrixResponse { // error message. Empty if success @@ -175,16 +155,13 @@ message CreateMatrixResponse { required int64 handle = 2; } -message ReleaseMatrixRequest { - required int64 handle = 1; -} +message ReleaseMatrixRequest { required int64 handle = 1; } message ReleaseMatrixResponse { // error message. 
Empty if success optional string return_message = 1; } - /** * The operations are defined using the variables commented at Operation * and OperationResult @@ -245,36 +222,36 @@ enum MatrixVectorOperation { message ProtoVector { required int64 dim = 1; - repeated double values = 2 [packed = true]; + repeated double values = 2 [ packed = true ]; } message ProtoMatrix { required int64 num_rows = 1; required int64 num_cols = 2; - repeated double values = 3 [packed = true]; + repeated double values = 3 [ packed = true ]; } message Operation { required MatrixVectorOperation operation = 1; // vector handles created on the pserver - repeated int64 pvectors = 2; // u, v, w + repeated int64 pvectors = 2; // u, v, w // matrix handles created on the pserver - repeated int64 pmatrices = 3; // A, B, C + repeated int64 pmatrices = 3; // A, B, C - repeated double scalars = 4; // a, b, c - repeated ProtoVector vectors = 5; // x, y, z - repeated ProtoMatrix matrices = 6; // X, Y, Z + repeated double scalars = 4; // a, b, c + repeated ProtoVector vectors = 5; // x, y, z + repeated ProtoMatrix matrices = 6; // X, Y, Z } message OperationResult { // error message. Empty if success optional string return_message = 1; -// - repeated double scalars = 2; // d, e, f + // + repeated double scalars = 2; // d, e, f repeated ProtoVector vectors = 3; // p, q, r - repeated ProtoMatrix matrices = 4; // P, Q, R + repeated ProtoMatrix matrices = 4; // P, Q, R } message DoOperationRequest { @@ -301,18 +278,14 @@ message DoOperationResponse { required bool pass_finish = 3; } -message LoadValueRequest { - required string dir_name = 1; -} +message LoadValueRequest { required string dir_name = 1; } message LoadValueResponse { // error message. Empty if success optional string return_message = 1; } -message SaveValueRequest { - required string dir_name = 1; -} +message SaveValueRequest { required string dir_name = 1; } message SaveValueResponse { // error message. 
Empty if success @@ -331,11 +304,11 @@ enum DataUpdateMode { // Client send it's own ref label to pserver DATA_UPDATE_MODE_SET_REF_LABEL = 4; // Client get all ref labels from all pservers - DATA_UPDATE_MODE_GET_REF_LABEL =5; + DATA_UPDATE_MODE_GET_REF_LABEL = 5; // Client send it's own ref grad to pserver - DATA_UPDATE_MODE_SET_REF_GRAD =6; + DATA_UPDATE_MODE_SET_REF_GRAD = 6; // Client get all ref grad from all pservers - DATA_UPDATE_MODE_GET_REF_GRAD =7; + DATA_UPDATE_MODE_GET_REF_GRAD = 7; } enum SendDataType { @@ -360,7 +333,7 @@ message DataBlock { // byte size of one data type required int32 data_size = 2; // data_type - optional TransDataType data_type = 3 [default = TRANS_DOUBLE]; + optional TransDataType data_type = 3 [ default = TRANS_DOUBLE ]; } message SendDataRequest { diff --git a/proto/TrainerConfig.proto b/proto/TrainerConfig.proto index a819d20d11ff3932d331801007b8cfb9c77a3f2b..b7c2355159e66be0a1550d3c8fde9a15346ff7e4 100644 --- a/proto/TrainerConfig.proto +++ b/proto/TrainerConfig.proto @@ -20,14 +20,14 @@ package paddle; message OptimizationConfig { required int32 batch_size = 3; - required string algorithm = 4 [default = "async_sgd"]; - optional int32 num_batches_per_send_parameter = 5 [default = 1]; - optional int32 num_batches_per_get_parameter = 6 [default = 1]; + required string algorithm = 4 [ default = "async_sgd" ]; + optional int32 num_batches_per_send_parameter = 5 [ default = 1 ]; + optional int32 num_batches_per_get_parameter = 6 [ default = 1 ]; required double learning_rate = 7; - optional double learning_rate_decay_a = 8 [default = 0]; - optional double learning_rate_decay_b = 9 [default = 0]; - optional string learning_rate_schedule = 27 [default = "constant"]; + optional double learning_rate_decay_a = 8 [ default = 0 ]; + optional double learning_rate_decay_b = 9 [ default = 0 ]; + optional string learning_rate_schedule = 27 [ default = "constant" ]; // learning rate will be scaled according to learning_rate_schedule // 1), 
constant: // lr = learning_rate @@ -49,88 +49,92 @@ message OptimizationConfig { // owlqn related // L1-regularization - optional double l1weight = 10 [default = 0.1]; + optional double l1weight = 10 [ default = 0.1 ]; // L2-regularization - optional double l2weight = 11 [default = 0]; + optional double l2weight = 11 [ default = 0 ]; // "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step) // then accept the step - optional double c1 = 12 [default = 0.0001]; + optional double c1 = 12 [ default = 0.0001 ]; // multiply the step with "backoff", when wolfe condition doesn't satisfy - optional double backoff = 13 [default = 0.5]; + optional double backoff = 13 [ default = 0.5 ]; // how many "s"s and "y"s are kept in owlqn - optional int32 owlqn_steps = 14 [default = 10]; + optional int32 owlqn_steps = 14 [ default = 10 ]; // accept the step if encountered "max_backoff" times of "reduce the step" - optional int32 max_backoff = 15 [default = 5]; + optional int32 max_backoff = 15 [ default = 5 ]; // L2-regularization coefficient is reduced linearly from iteration 0 to // "l2weight_zero_iter", and set to 0 after "l2weight_zero_iter" // iterations. set "l2weight_zero_iter" to 0 to disable this strategy. - optional int32 l2weight_zero_iter = 17 [default = 0]; + optional int32 l2weight_zero_iter = 17 [ default = 0 ]; // averaged sgd // About average_window * numBatchProcessed parameter are used // for average. To be accurate, between average_window * numBatchProcessed // and 2 * average_window * numBatchProcessed parameters are used for // average. 
- optional double average_window = 18 [default = 0]; - optional int64 max_average_window = 19 [default = 0x7fffffffffffffff]; + optional double average_window = 18 [ default = 0 ]; + optional int64 max_average_window = 19 [ default = 0x7fffffffffffffff ]; ////////////////////////// // Options Adaptive SGD // ////////////////////////// - // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta", "rmsprop" - // default learning method("momentum") use global decayed learning rate with momentum. + // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta", + // "rmsprop" + // default learning method("momentum") use global decayed learning rate with + // momentum. // "adagrad", "adadelta" and "rmsprop" can set momentum too. - optional string learning_method = 23 [default = "momentum"]; - optional double ada_epsilon = 24 [default = 1e-6]; - optional double ada_rou = 26 [default = 0.95]; + optional string learning_method = 23 [ default = "momentum" ]; + optional double ada_epsilon = 24 [ default = 1e-6 ]; + optional double ada_rou = 26 [ default = 0.95 ]; // Force to do average in cpu in order to save gpu memory usage - optional bool do_average_in_cpu = 25 [default = false]; + optional bool do_average_in_cpu = 25 [ default = false ]; // delta add rate in pserver, used while num_batches_per_send_parameter>1 // will be divided by #machines automatically. - optional double delta_add_rate = 28 [default = 1.0]; + optional double delta_add_rate = 28 [ default = 1.0 ]; // We split a large size into smaller mini-batches, whose sizes are // determined by mini_batch_size. It only takes effect when there is // an ExternalMachine. 
- optional int32 mini_batch_size = 29 [default = 128]; + optional int32 mini_batch_size = 29 [ default = 128 ]; // automatically set if any one of parameters set sparse remote update flag - optional bool use_sparse_remote_updater = 30 [default = false]; + optional bool use_sparse_remote_updater = 30 [ default = false ]; - // how to update center parameter and feedback to local parameter, + // how to update center parameter and feedback to local parameter, // when use local sgd update in cluster training. - // A option is elastic_average, proposed by the paper: Deep learning with elastic averaging SGD. - // If use elastic_average method, every trainer node should sample from whole data sets. - optional string center_parameter_update_method = 31 [default = "average"]; + // A option is elastic_average, proposed by the paper: Deep learning with + // elastic averaging SGD. + // If use elastic_average method, every trainer node should sample from whole + // data sets. + optional string center_parameter_update_method = 31 [ default = "average" ]; // shrink sparse parameter value // only works if parameter is remote sparse update and has L1 decay rate - optional double shrink_parameter_value = 32 [default = 0]; + optional double shrink_parameter_value = 32 [ default = 0 ]; //////////////////////////// // Options Adam Optimizer // //////////////////////////// - optional double adam_beta1 = 33 [default = 0.9]; - optional double adam_beta2 = 34 [default = 0.999]; - optional double adam_epsilon = 35 [default = 1e-8]; + optional double adam_beta1 = 33 [ default = 0.9 ]; + optional double adam_beta2 = 34 [ default = 0.999 ]; + optional double adam_epsilon = 35 [ default = 1e-8 ]; // arguments for learning rate scheduler // Format: num1:rate1,num2:rate2,...,numK:rateK // For learning_rate_schedule="manual", num is the number of samples, // For learning_rate_schedule="pass_manual", // num is the number of passes (starting from 0) - optional string learning_rate_args = 36 [default 
= ""]; - + optional string learning_rate_args = 36 [ default = "" ]; + // for async sgd gradient commit control. // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed, // current async gradient will be discard silently. - optional double async_lagged_grad_discard_ratio = 37 [default = 1.5]; + optional double async_lagged_grad_discard_ratio = 37 [ default = 1.5 ]; - // global threshold for gradient clipping - optional double gradient_clipping_threshold = 38 [default = 0.0]; + // global threshold for gradient clipping + optional double gradient_clipping_threshold = 38 [ default = 0.0 ]; }; message TrainerConfig { @@ -141,7 +145,7 @@ message TrainerConfig { repeated string config_files = 5; // the directory to save/load model files for each training path - optional string save_dir = 6 [default = "./output/model"]; + optional string save_dir = 6 [ default = "./output/model" ]; // Path of the initial model parameters. // If it was set, start_pass will be ignored. @@ -149,7 +153,7 @@ message TrainerConfig { // Start training from this pass. // Will load parameter from the previous pass. 
- optional int32 start_pass = 8 [default = 0]; + optional int32 start_pass = 8 [ default = 0 ]; // file path to the trainer config file optional string config_file = 9; diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 0171f9d8ccd6045cb876d57684269a2a49e77f96..b5030da8e75eb94e857ae4effc6adb6d19dc0e93 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -39,7 +39,7 @@ add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) add_custom_target(paddle_python ALL DEPENDS - ${OUTPUT_DIR}/.timestamp) + ${OUTPUT_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model python_api_wheel) set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index f71fefffb59d4a53dda092ff83a61d9eec4b601f..b7b696ef0c13e1bae2e910e08d1a1ea3e45cd5d5 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2198,6 +2198,20 @@ class RowConvLayer(LayerBase): self.create_input_parameter(0, psize, dims) +@config_layer('clip') +class ClipLayer(LayerBase): + def __init__(self, name, inputs, min, max, **xargs): + super(ClipLayer, self).__init__(name, 'clip', 0, inputs=inputs, **xargs) + config_assert( + len(self.inputs) == 1, + 'ClipLayer must have one and only one input.') + config_assert(min < max, 'min must be less than max.') + input_layer = self.get_input_layer(0) + self.set_layer_size(input_layer.size) + self.config.inputs[0].clip_conf.min = min + self.config.inputs[0].clip_conf.max = max + + # key: cost type # value: cost class g_cost_map = {} @@ -2643,6 +2657,31 @@ class SubSequenceLayer(LayerBase): self.create_bias_parameter(bias, size) +@config_layer('sub_nested_seq') +class SubNestedSequenceLayer(LayerBase): + def __init__(self, name, inputs, selected_indices, bias=False, **xargs): + if 
isinstance(inputs, list): + assert len(inputs) == 1, ('the first input of sub_nested_seq ' + 'layer is a single nested sequence.') + inputs = inputs[0] + if isinstance(selected_indices, list): + assert len(selected_indices) == 1, ( + 'the second input of ' + 'sub_nested_seq layer is a single layer which is a ' + 'set of selected indices.') + selected_indices = selected_indices[0] + + super(SubNestedSequenceLayer, self).__init__( + name, + 'sub_nested_seq', + 0, + inputs=[inputs, selected_indices], + **xargs) + input_layer0 = self.get_input_layer(0) + size = input_layer0.size + self.set_layer_size(size) + + @config_layer('out_prod') class OuterProdLayer(LayerBase): def __init__(self, name, inputs, device=None): @@ -2754,6 +2793,16 @@ class SumToOneNormLayer(LayerBase): self.set_layer_size(input_layer0.size) +@config_layer('row_l2_norm') +class RowL2NormLayer(LayerBase): + def __init__(self, name, inputs, **xargs): + super(RowL2NormLayer, self).__init__( + name, 'row_l2_norm', 0, inputs=inputs, **xargs) + config_assert(len(self.inputs) == 1, 'RowL2NormLayer must have 1 input') + input_layer = self.get_input_layer(0) + self.set_layer_size(input_layer.size) + + @config_layer('cos_vm') class CosSimVecMatLayer(LayerBase): def __init__(self, name, size, inputs, cos_scale=1.0, device=None): @@ -3199,6 +3248,16 @@ class CTCLayer(LayerBase): config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs') +@config_layer('kmax_seq_score') +class KmaxSeqScoreLayer(LayerBase): + def __init__(self, name, inputs, beam_size, **xargs): + super(KmaxSeqScoreLayer, self).__init__( + name, 'kmax_seq_score', 0, inputs=inputs, **xargs) + config_assert( + len(self.inputs) == 1, 'KmaxSeqScoreLayer has only one input.') + self.config.beam_size = beam_size + + @config_layer('warp_ctc') class WarpCTCLayer(LayerBase): def __init__(self, diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 
965874ddf632a83d00065c2d40037930a6e604a8..1bc55c869601551aff5fc0311458f906385522d2 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -76,6 +76,7 @@ __all__ = [ 'trans_layer', 'rotate_layer', 'sum_to_one_norm_layer', + 'row_l2_norm_layer', 'get_output_layer', 'LayerType', 'context_projection', @@ -128,7 +129,10 @@ __all__ = [ 'prelu_layer', 'gated_unit_layer', 'crop_layer', + 'sub_nested_seq_layer', + 'clip_layer', 'slice_projection', + 'kmax_sequence_score_layer', ] @@ -160,6 +164,7 @@ class LayerType(object): BATCH_NORM_LAYER = 'batch_norm' NORM_LAYER = 'norm' SUM_TO_ONE_NORM_LAYER = 'sum_to_one_norm' + ROW_L2_NORM_LAYER = 'row_l2_norm' ADDTO_LAYER = 'addto' CONCAT_LAYER = 'concat' @@ -221,6 +226,10 @@ class LayerType(object): PRELU = 'prelu' CROP_LAYER = 'crop' + SUB_NESTED_SEQ = 'sub_nested_seq' + CLIP_LAYER = 'clip' + + KMAX_SEQ_SCORE = 'kmax_seq_score' @staticmethod def is_layer_type(type_name): @@ -2889,6 +2898,42 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None): name, LayerType.SUM_TO_ONE_NORM_LAYER, parents=[input], size=input.size) +@wrap_name_default() +@layer_support() +def row_l2_norm_layer(input, name=None, layer_attr=None): + """ + A layer for L2-normalization in each row. + + .. math:: + out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}} + + where the size of :math:`in` is (batchSize x dataDim) , + and the size of :math:`out` is a (batchSize x dataDim) . + + The example usage is: + + .. code-block:: python + + row_l2_norm_layer = row_l2_norm_layer(input=layer) + + :param input: Input layer. + :type input: LayerOutput + :param name: Layer name. + :type name: basestring + :param layer_attr: extra layer attributes. + :type layer_attr: ExtraLayerAttribute. + :return: LayerOutput object. 
+ :rtype: LayerOutput + """ + Layer( + name=name, + type=LayerType.ROW_L2_NORM_LAYER, + inputs=[input.name], + **ExtraAttr.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.ROW_L2_NORM_LAYER, parents=[input], size=input.size) + + @wrap_name_default("addto") @wrap_act_default(act=LinearActivation()) @wrap_bias_attr_default(has_bias=False) @@ -6046,3 +6091,122 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None): layer_type=LayerType.CROP_LAYER, parents=input, size=l.config.size) + + +@wrap_name_default() +@layer_support() +def sub_nested_seq_layer(input, selected_indices, name=None): + """ + The sub_nested_seq_layer accepts two inputs: the first one is a nested + sequence; the second one is a set of selceted indices in the nested sequence. + + Then sub_nest_seq_layer trims the first nested sequence input according + to the selected indices to form a new output. This layer is useful in + beam training. + + The example usage is: + + .. code-block:: python + + sub_nest_seq = sub_nested_seq_layer(input=[data, selected_indices]) + + + :param input: A nested sequence. + :type input: LayerOutput + :param selected_indices: a set of sequence indices in the nested sequence. + :type input: LayerOutput + :param name: name of this layer. + :type name: basestring + :return: LayerOutput object. + :rtype: LayerOutput + """ + + assert isinstance(input, LayerOutput), ( + 'The first input of ' + 'sub_nested_seq_layer must be a Paddle layer.') + assert isinstance(selected_indices, LayerOutput), ( + 'The second input of ' + 'sub_nested_seq_layer must be a Paddle layer.') + + l = Layer( + inputs=input.name, + selected_indices=selected_indices.name, + name=name, + type=LayerType.SUB_NESTED_SEQ) + return LayerOutput( + name=name, + layer_type=LayerType.SUB_NESTED_SEQ, + parents=input, + size=l.config.size) + + +@wrap_name_default("clip") +def clip_layer(input, min, max, name=None): + """ + A layer for clipping the input value by the threshold. 
+ + .. math:: + + out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right) + + .. code-block:: python + + clip = clip_layer(input=input_layer, min=-10, max=10) + + :param name: The Layer Name. + :type name: basestring + :param input: The input layer. + :type input: LayerOutput. + :param min: The lower threshold for clipping. + :type min: double + :param max: The upper threshold for clipping. + :type max: double + :return: LayerOutput object. + :rtype: LayerOutput + """ + Layer( + name=name, + type=LayerType.CLIP_LAYER, + inputs=[input.name], + min=min, + max=max) + return LayerOutput( + name, LayerType.CLIP_LAYER, parents=[input], size=input.size) + + +@wrap_name_default() +@layer_support() +def kmax_sequence_score_layer(input, name=None, beam_size=1): + """ + This layer accepts one input which are scores over a sequence or a nested + sequence, and returns indices of beam_size sequences with highest scores. + + .. code-block:: python + + kmax_indices = kmax_sequence_score_layer(input=input_layer, beam_size) + + + :param name: The Layer Name. + :type name: basestring + :param input: The input layer. It stores scores over a sequence or a nested + sequence and its size must be 1. + :type input: LayerOutput. + :param beam_size: squence indices with top beam_size scores are returned. + :type beam_size: double + :return: LayerOutput object. 
+ :rtype: LayerOutput + """ + assert isinstance(input, LayerOutput), ("kmax_sequence_score_layer " + "accepts only one input.") + assert input.size == 1, ( + "input of kmax_sequence_score_layer is a score" + "over a sequence or a nested sequence, so its width must be 1.") + + Layer( + name=name, + type=LayerType.KMAX_SEQ_SCORE, + inputs=[input.name], + beam_size=beam_size) + + return LayerOutput( + name, LayerType.KMAX_SEQ_SCORE, parents=[input], size=input.size) diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index cdf9b2eab733adb173cf33cd6a93ef7b5abefc50..a61beb871ad064c617fa141451afcb2a5ac64854 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -7,6 +7,7 @@ test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer -test_recursive_topology test_gated_unit_layer) +test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer +test_kmax_seq_socre_layer test_seq_select_layers) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr new file mode 100644 index 0000000000000000000000000000000000000000..4b9578a0c050ef74f186485fec3f6c1f7a0f0814 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr @@ -0,0 +1,31 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 300 + active_type: "" +} +layers { + name: "__clip_0__" + type: "clip" + size: 300 + active_type: "" + inputs { + 
input_layer_name: "input" + clip_conf { + min: -10 + max: 10 + } + } +} +input_layer_names: "input" +output_layer_names: "__clip_0__" +sub_models { + name: "root" + layer_names: "input" + layer_names: "__clip_0__" + input_layer_names: "input" + output_layer_names: "__clip_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr new file mode 100644 index 0000000000000000000000000000000000000000..81bd71f68eb3f2c04ccd46ee3b77a07543395c60 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr @@ -0,0 +1,66 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 300 + active_type: "" +} +layers { + name: "data" + type: "data" + size: 128 + active_type: "" +} +layers { + name: "__fc_layer_0__" + type: "fc" + size: 1 + active_type: "exponential" + inputs { + input_layer_name: "data" + input_parameter_name: "___fc_layer_0__.w0" + } + bias_parameter_name: "___fc_layer_0__.wbias" +} +layers { + name: "__kmax_sequence_score_layer_0__" + type: "kmax_seq_score" + active_type: "" + inputs { + input_layer_name: "__fc_layer_0__" + } + beam_size: 5 +} +parameters { + name: "___fc_layer_0__.w0" + size: 128 + initial_mean: 0.0 + initial_std: 0.0883883476483 + dims: 128 + dims: 1 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___fc_layer_0__.wbias" + size: 1 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 1 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "data" +output_layer_names: "__kmax_sequence_score_layer_0__" +sub_models { + name: "root" + layer_names: "input" + layer_names: "data" + layer_names: "__fc_layer_0__" + layer_names: "__kmax_sequence_score_layer_0__" + input_layer_names: "data" + output_layer_names: "__kmax_sequence_score_layer_0__" + is_recurrent_layer_group: 
false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr new file mode 100644 index 0000000000000000000000000000000000000000..c2786ff55c7023d856d739face5e747cc5fee870 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr @@ -0,0 +1,27 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 300 + active_type: "" +} +layers { + name: "__row_l2_norm_layer_0__" + type: "row_l2_norm" + size: 300 + active_type: "" + inputs { + input_layer_name: "input" + } +} +input_layer_names: "input" +output_layer_names: "__row_l2_norm_layer_0__" +sub_models { + name: "root" + layer_names: "input" + layer_names: "__row_l2_norm_layer_0__" + input_layer_names: "input" + output_layer_names: "__row_l2_norm_layer_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr new file mode 100644 index 0000000000000000000000000000000000000000..4b906b113e3c0569d5576127e100d097e4923436 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr @@ -0,0 +1,37 @@ +type: "nn" +layers { + name: "input_seq" + type: "data" + size: 300 + active_type: "" +} +layers { + name: "input" + type: "data" + size: 5 + active_type: "" +} +layers { + name: "__sub_nested_seq_layer_0__" + type: "sub_nested_seq" + size: 300 + active_type: "" + inputs { + input_layer_name: "input_seq" + } + inputs { + input_layer_name: "input" + } +} +input_layer_names: "input_seq" +output_layer_names: "__sub_nested_seq_layer_0__" +sub_models { + name: "root" + layer_names: "input_seq" + layer_names: "input" + layer_names: "__sub_nested_seq_layer_0__" + input_layer_names: "input_seq" + 
output_layer_names: "__sub_nested_seq_layer_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..f066fe1fb30877bf40bb6299d35546f7427989a5 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py @@ -0,0 +1,6 @@ +from paddle.trainer_config_helpers import * + +data = data_layer(name='input', size=300) +clip = clip_layer(input=data, min=-10, max=10) + +outputs(clip) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..d245c5a41c793e1f02f306bfe64071bd9885906e --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +#coding=utf-8 +from paddle.trainer_config_helpers import * + +data = data_layer(name='input', size=300) + +data = data_layer(name="data", size=128) +scores = fc_layer(input=data, size=1, act=ExpActivation()) +kmax_seq_id = kmax_sequence_score_layer(input=scores, beam_size=5) + +outputs(kmax_seq_id) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..ac8badb26a40e96e75225e6f61aa536cd28e9098 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py @@ -0,0 +1,6 @@ +from paddle.trainer_config_helpers import * + +data = data_layer(name='input', size=300) +row_l2_norm = row_l2_norm_layer(input=data) + +outputs(row_l2_norm) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py 
b/python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..6d1c3175ba9801d69f3f9cb9e754858253192270 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +#coding=utf-8 +from paddle.trainer_config_helpers import * + +beam_size = 5 + +data = data_layer(name='input_seq', size=300) +selected_ids = data_layer(name='input', size=beam_size) +sub_nest_seq = sub_nested_seq_layer(input=data, selected_indices=selected_ids) + +outputs(sub_nest_seq) diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index f885b2834e8ad502b752c6fd53daf7ef1693433f..0a2a1ced11ee5cb2fb407b229ce810d553c2fa46 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -133,7 +133,7 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train100(), 10, "cifar_train100") - paddle.v2.dataset.common.convert(path, test100(), 10, "cifar_test100") - paddle.v2.dataset.common.convert(path, train10(), 10, "cifar_train10") - paddle.v2.dataset.common.convert(path, test10(), 10, "cifar_test10") + paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100") + paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100") + paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10") + paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10") diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index 111496618dfa997246d0a067b0cd4c7dad74f9dc..053ae151c571e5557c9f2f9f4ec866f546a77797 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -32,17 +32,22 @@ __all__ = [ DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset') + # When running unit tests, there could be multiple processes that # trying to create DATA_HOME 
directory simultaneously, so we cannot # use a if condition to check for the existence of the directory; # instead, we use the filesystem as the synchronization mechanism by # catching returned errors. -try: - os.makedirs(DATA_HOME) -except OSError as exc: - if exc.errno != errno.EEXIST: - raise - pass +def must_mkdirs(path): + try: + os.makedirs(DATA_HOME) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + pass + + +must_mkdirs(DATA_HOME) def md5file(fname): @@ -93,6 +98,19 @@ def fetch_all(): "fetch")() +def fetch_all_recordio(path): + for module_name in filter(lambda x: not x.startswith("__"), + dir(paddle.v2.dataset)): + if "convert" in dir( + importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \ + not module_name == "common": + ds_path = os.path.join(path, module_name) + must_mkdirs(ds_path) + getattr( + importlib.import_module("paddle.v2.dataset.%s" % module_name), + "convert")(ds_path) + + def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): """ you can call the function as: diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index f8aae52e7c29d86c7da9c1da0dd1d093634d4567..23f5a24a1cea7f665fb65e802e1a7811df78208d 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -233,5 +233,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, test(), 10, "conl105_train") - paddle.v2.dataset.common.convert(path, test(), 10, "conl105_test") + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test") diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index c0ec5992e0e6b0a2fd2359910d0f7a6c690c2ec3..93dd3e8f7d3a569eaf56335f0f92bed04c0ee26c 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -173,5 +173,5 @@ def convert(path): Converts dataset to 
recordio format """ w = word_dict() - paddle.v2.dataset.common.convert(path, lambda: train(w), 10, "imdb_train") - paddle.v2.dataset.common.convert(path, lambda: test(w), 10, "imdb_test") + paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train") + paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test") diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py index b18ee8e9ba91e0e8ccf061223b3c0d4636442956..617c722c4165cdfed9e650fc968d623ef6ed4391 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/v2/dataset/imikolov.py @@ -155,6 +155,7 @@ def convert(path): N = 5 word_dict = build_dict() paddle.v2.dataset.common.convert(path, - train(word_dict, N), 10, "imikolov_train") + train(word_dict, N), 1000, + "imikolov_train") paddle.v2.dataset.common.convert(path, - test(word_dict, N), 10, "imikolov_test") + test(word_dict, N), 1000, "imikolov_test") diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index ea5891f4f3f6ee1c5023cccee9732cbd9d78b881..9f675bed895223e054cd3bb6e504fe1607f19858 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -119,5 +119,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 10, "minist_train") - paddle.v2.dataset.common.convert(path, test(), 10, "minist_test") + paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test") diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index d9372d422a3293eddeb7c0d5b7c8980f55c44690..5b61a9420af1bb81e1d826f8a7b69f34c306d382 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -254,8 +254,8 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 10, "movielens_train") - 
paddle.v2.dataset.common.convert(path, test(), 10, "movielens_test") + paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test") if __name__ == '__main__': diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py index e33f120c8734621fd60497298d993e6e43bd06e0..b0b9757c1a75d215cf8945b5cedbb1239fd43af7 100644 --- a/python/paddle/v2/dataset/sentiment.py +++ b/python/paddle/v2/dataset/sentiment.py @@ -137,5 +137,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train, 10, "sentiment_train") - paddle.v2.dataset.common.convert(path, test, 10, "sentiment_test") + paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train") + paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test") diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index ec10ce646ebf3eca2c2a6423b69ee11b6a2b99cf..ce60aa21c2ad1fb8f089d19d548b59a8c806d1ee 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py @@ -119,5 +119,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 10, "uci_housing_train") - paddle.v2.dataset.common.convert(path, test(), 10, "uci_houseing_test") + paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test") diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index 2a631c365f27a6039021a56268a62017638c2739..95a35d97ce9d9503153974cc167ee60829244d5f 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -169,5 +169,6 @@ def convert(path): Converts dataset to recordio format """ dict_size = 30000 - paddle.v2.dataset.common.convert(path, train(dict_size), 10, "wmt14_train") - 
paddle.v2.dataset.common.convert(path, test(dict_size), 10, "wmt14_test") + paddle.v2.dataset.common.convert(path, + train(dict_size), 1000, "wmt14_train") + paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test") diff --git a/python/paddle/v2/framework/network.py b/python/paddle/v2/framework/network.py deleted file mode 100644 index cfeb0e3dec0fd2c6ad4d2d2501f97932495fdd41..0000000000000000000000000000000000000000 --- a/python/paddle/v2/framework/network.py +++ /dev/null @@ -1,131 +0,0 @@ -import paddle.v2.framework.core as core -from paddle.v2.framework.create_op_creation_methods import op_creations -from default_scope_funcs import new_var, find_var, get_cur_scope - -__all__ = ['Network'] # Only expose Network - - -class NetworkFunctor(object): - """ - Network Op Creation Function. Used internally in this module. - It convert string input to Variable. If it is not created before, just - create in scope. - - It is a functor object. means the instances are callable. - - :param func: The op creation function which generated in Python. - :param net: The Network instance. 
- """ - - def __init__(self, func, net): - self.func = func - self.net = net - - def __call__(self, *args, **kwargs): - if len(args) != 0: - raise ValueError("Paddle must use keyword argument") - inputs = self.func.all_input_args - for ipt in inputs: - if ipt in kwargs: - var = kwargs[ipt] - if isinstance(var, basestring): - tmp = new_var(var) - self.net.var_names[tmp] = var - var = tmp - - if not isinstance(var, core.Variable): - raise TypeError( - "Input of op creation must be string or variable") - - kwargs[ipt] = self.net.var_names[var] - - notemp_outputs = self.func.all_not_temp_output_args - - for name in notemp_outputs: - if name not in kwargs: - kwargs[ - name] = self.func.__name__ + "@OUT@%d" % core.unique_integer( - ) - - outputs = self.func.all_output_args - for opt in outputs: - if opt in kwargs: - var = kwargs[opt] - if isinstance(var, basestring): - tmp = new_var(var) - self.net.var_names[tmp] = var - var = tmp - - if not isinstance(var, core.Variable): - raise TypeError( - "Output of op creation must be string or variable") - kwargs[opt] = self.net.var_names[var] - - op = self.func(**kwargs) - - self.net.net.add_op(op) - - lst = [find_var(kwargs[opt]) for opt in notemp_outputs] - if len(lst) == 1: - return lst[0] - elif len(lst) == 0: - return None - else: - return lst - - -class Network(object): - """ - The network concept. It avoid user to manually create operator, create - variable, and combine them into a Net. Just use Network.xxx can create the - operator, create variables in default scope, and add them into `self.net`. - - For example: - - .. code-block: python - - net = Network() - out = net.add_two(X="a", Y="b") - fc_out = net.fc(X="out", W="fc.w") - - net.run(...) 
- """ - - def __init__(self): - self.net = core.Net.create() - funcs = (func_name for func_name in dir(op_creations) - if not func_name.startswith("__")) - self.var_names = dict() - - # TODO(yuyang18): This code can work, but do not generate a good - # docstring, try to give a better way generate function in runtime - # later. - for func_name in funcs: - func = getattr(op_creations, func_name) - impl = NetworkFunctor(func, self) - setattr(self, func_name, impl.__call__) - self.__complete_add_op__ = False - - def infer_shape(self): - self.complete_add_op() - self.net.infer_shape(get_cur_scope()) - - def run(self, device_context): - self.complete_add_op() - self.net.run(get_cur_scope(), device_context) - - def __str__(self): - return str(self.net) - - def complete_add_op(self): - if not self.__complete_add_op__: - self.net.complete_add_op() - self.__complete_add_op__ = True - - -if __name__ == '__main__': - net = Network() - out = net.add_two(X="a", Y="b") - fc_out = net.fc(X=out, W="fc.w", b="fc.b", activation="softmax") - net.complete_add_op() - print net diff --git a/python/paddle/v2/framework/create_op_creation_methods.py b/python/paddle/v2/framework/op.py similarity index 60% rename from python/paddle/v2/framework/create_op_creation_methods.py rename to python/paddle/v2/framework/op.py index b034efffb69030cb09e09ea545e9bff6f1744671..7fd8b55a5d167294d3270c79f7b64da03443afd3 100644 --- a/python/paddle/v2/framework/create_op_creation_methods.py +++ b/python/paddle/v2/framework/op.py @@ -1,8 +1,7 @@ import paddle.v2.framework.core as core import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2 import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2 -import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2 -import cStringIO +import paddle.v2.framework.proto.attribute_pb2 as attribute_pb2 def get_all_op_protos(): @@ -57,7 +56,7 @@ class OpDescCreationMethod(object): op_desc.attrs.extend([out_format]) if len(tmp_index) != 0: tmp_index_attr = 
op_desc.attrs.add() - tmp_index_attr.type = attr_type_pb2.INTS + tmp_index_attr.type = attribute_pb2.INTS tmp_index_attr.name = "temporary_index" tmp_index_attr.ints.extend(tmp_index) @@ -73,17 +72,17 @@ class OpDescCreationMethod(object): new_attr = op_desc.attrs.add() new_attr.name = attr.name new_attr.type = attr.type - if attr.type == attr_type_pb2.INT: + if attr.type == attribute_pb2.INT: new_attr.i = user_defined_attr - elif attr.type == attr_type_pb2.FLOAT: + elif attr.type == attribute_pb2.FLOAT: new_attr.f = user_defined_attr - elif attr.type == attr_type_pb2.STRING: + elif attr.type == attribute_pb2.STRING: new_attr.s = user_defined_attr - elif attr.type == attr_type_pb2.INTS: + elif attr.type == attribute_pb2.INTS: new_attr.ints.extend(user_defined_attr) - elif attr.type == attr_type_pb2.FLOATS: + elif attr.type == attribute_pb2.FLOATS: new_attr.floats.extend(user_defined_attr) - elif attr.type == attr_type_pb2.STRINGS: + elif attr.type == attribute_pb2.STRINGS: new_attr.strings.extend(user_defined_attr) else: raise NotImplementedError("Not support attribute type " + @@ -109,7 +108,7 @@ class OpDescCreationMethod(object): retv = [] if multiple: var_format = op_desc_pb2.AttrDesc() - var_format.type = attr_type_pb2.INTS + var_format.type = attribute_pb2.INTS var_format.name = "%s_format" % in_out var_format.ints.append(0) @@ -146,64 +145,14 @@ class OpDescCreationMethod(object): return False -def get_docstring_from_op_proto(op_proto): - """ - Generate docstring from a OpProto - :param op_proto: a OpProto instance. - :type op_proto: op_proto_pb2.OpProto - :return: docstring - """ - if not isinstance(op_proto, op_proto_pb2.OpProto): - raise TypeError("Input must be OpProto") - f = cStringIO.StringIO() - f.write(op_proto.comment) - f.write("\n") - - def __append_param__(name, comment, type): - # Maybe replace the following line with template engine is better. 
- f.write(":param ") - f.write(name) - f.write(": ") - f.write(comment) - f.write("\n") - f.write(":type ") - f.write(name) - f.write(": ") - f.write(type) - f.write("\n") - - for ipt in op_proto.inputs: - __append_param__(ipt.name, ipt.comment, "list | basestr" - if ipt.multiple else "basestr") - - temp_var_prefix = \ - "This is a temporary variable. It does not have to set by user. " - for opt in op_proto.outputs: - __append_param__(opt.name, opt.comment if not opt.temporary else - temp_var_prefix + opt.comment, "list | basestr" - if opt.multiple else "basestr") - - for attr in op_proto.attrs: - attr_type = None - if attr.type == attr_type_pb2.INT: - attr_type = "int" - elif attr.type == attr_type_pb2.FLOAT: - attr_type = "float" - elif attr.type == attr_type_pb2.STRING: - attr_type = "basestr" - elif attr.type == attr_type_pb2.INTS: - attr_type = "list of int" - elif attr.type == attr_type_pb2.FLOATS: - attr_type = "list of float" - elif attr.type == attr_type_pb2.STRINGS: - attr_type = "list of basestr" - - if attr_type is None: - raise RuntimeError("Not supported attribute type " + attr.type) - - __append_param__(attr.name, attr.comment, attr_type) - - return f.getvalue() +class OpInfo(object): + def __init__(self, name, method, inputs, outputs, attrs, no_temp_outputs): + self.name = name + self.method = method + self.inputs = inputs + self.outputs = outputs + self.attrs = attrs + self.no_temp_outputs = no_temp_outputs def create_op_creation_method(op_proto): @@ -216,38 +165,57 @@ def create_op_creation_method(op_proto): opdesc = method(*args, **kwargs) return core.Operator.create(opdesc.SerializeToString()) - __impl__.__doc__ = get_docstring_from_op_proto(op_proto) - __impl__.all_input_args = [var.name for var in op_proto.inputs] - __impl__.all_output_args = [var.name for var in op_proto.outputs] - __impl__.all_attr_args = [attr.name for attr in op_proto.attrs] - __impl__.all_not_temp_output_args = [ - var.name for var in op_proto.outputs if not var.temporary 
- ] + return OpInfo( + method=__impl__, + name=op_proto.type, + inputs=[var.name for var in op_proto.inputs], + outputs=[var.name for var in op_proto.outputs], + attrs=[attr.name for attr in op_proto.attrs], + no_temp_outputs=[ + var.name for var in op_proto.outputs if not var.temporary + ]) - return __impl__ +class OperatorFactory(object): + def __init__(self): + self.op_methods = dict() + for op_proto in get_all_op_protos(): + method = create_op_creation_method(op_proto) + self.op_methods[method.name] = method -class OpCreationsHolder(object): - """ - A object will holds all op creation methods. - - Use `op_creations.xxx_op` to access them. - """ - pass + def __call__(self, *args, **kwargs): + if 'type' in kwargs: + if len(args) != 0: + raise ValueError("All Paddle argument should be key-word " + "argument except type") + t = kwargs.pop('type') + else: + if len(args) != 1: + raise ValueError("All Paddle argument should be key-word " + "argument except type") + t = args[0] + return self.get_op_info(t).method(**kwargs) -op_creations = OpCreationsHolder() + def types(self): + return self.op_methods.keys() + def get_op_info(self, t): + if t not in self.op_methods: + raise ValueError("operator %s is not registered", t) + return self.op_methods.get(t) -def __bootstrap__(): - """ - Bootstrap function for this module. It will dynamic create all op creation - methods in runtime. 
- """ - for op_proto in get_all_op_protos(): - func = create_op_creation_method(op_proto) - func.__name__ = str(op_proto.type) - setattr(op_creations, func.__name__, func) + def get_op_input_names(self, type): + return self.get_op_info(type).inputs + + def get_op_output_names(self, type): + return self.get_op_info(type).outputs + + def get_op_attr_names(self, type): + return self.get_op_info(type).attrs + + def get_op_no_temp_output_names(self, type): + return self.get_op_info(type).no_temp_outputs -__bootstrap__() +Operator = OperatorFactory() # Default global factory diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index 540636a0e8100fbf97231bd548dbc1176b07daca..10659caa882fd3d4060f9947413a392c3b681ee8 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -1,17 +1,25 @@ -add_python_test(test_framework - test_protobuf.py - test_scope.py - test_default_scope_funcs.py - test_op_creation_methods.py - test_net.py - test_tensor.py - test_fc_op.py - test_add_two_op.py - test_sgd_op.py - test_cross_entropy_op.py - test_mul_op.py - test_mean_op.py - test_sigmoid_op.py - test_softmax_op.py - test_rowwise_add_op.py - test_network.py) +py_test(test_net SRCS test_net.py) + +py_test(test_fc_op SRCS test_fc_op.py) +py_test(test_scope SRCS test_scope.py) + +py_test(test_tensor SRCS test_tensor.py) +py_test(test_mul_op SRCS test_mul_op.py) + +py_test(test_mean_op SRCS test_mean_op.py) + +py_test(test_protobuf SRCS test_protobuf.py) + +py_test(test_add_two_op SRCS test_add_two_op.py) +py_test(test_sigmoid_op SRCS test_sigmoid_op.py) +py_test(test_softmax_op SRCS test_softmax_op.py) +py_test(test_cross_entropy_op SRCS test_cross_entropy_op.py) +py_test(test_fill_zeros_like_op SRCS test_fill_zeros_like_op.py) + +py_test(gradient_checker SRCS gradient_checker.py) + +py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py) + +py_test(test_default_scope_funcs 
SRCS test_default_scope_funcs.py) +py_test(test_operator SRCS test_operator.py) +py_test(test_uniform_random_op SRCS test_uniform_random_op.py) diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..b73c4869d14a62a951d8e45dafb14b7523355519 --- /dev/null +++ b/python/paddle/v2/framework/tests/gradient_checker.py @@ -0,0 +1,236 @@ +import unittest + +import numpy +import paddle.v2.framework.core as core +from paddle.v2.framework.op import Operator + +__all__ = ['get_numeric_gradient'] + + +def create_op(op_type): + kwargs = dict() + for in_name in Operator.get_op_input_names(op_type): + kwargs[in_name] = in_name + for out_name in Operator.get_op_output_names(op_type): + kwargs[out_name] = out_name + + return Operator(op_type, **kwargs) + + +def grad_var_name(var_name): + return var_name + "@GRAD" + + +def get_numeric_gradient(op, + input_values, + output_name, + input_to_check, + delta=0.005, + local_scope=None): + """ + Get Numeric Gradient for an operator's input. + + :param op: C++ operator instance, could be an network + :param input_values: The input variables. Should be an dictionary, key is + variable name. Value is numpy array. + :param output_name: The final output variable name. + :param input_to_check: The input variable need to get gradient. + :param delta: The perturbation value for numeric gradient method. The + smaller delta is, the more accurate result will get. But if that delta is + too small, it could occur numerical stability problem. + :param local_scope: The local scope used for get_numeric_gradient. + :return: The gradient array in numpy format. 
+ """ + if local_scope is None: + local_scope = core.Scope() + + # Create all input variable in local_scope + for var_name in input_values: + var = local_scope.new_var(var_name) + tensor = var.get_tensor() + tensor.set_dims(input_values[var_name].shape) + tensor.alloc_float(core.CPUPlace()) + tensor.set(input_values[var_name], core.CPUPlace()) + + # Create all output variable in local_scope + for output in op.outputs(): + if local_scope.find_var(output) is None: + local_scope.new_var(output).get_tensor() + + op.infer_shape(local_scope) + + # allocate output memory + for output in op.outputs(): + local_scope.find_var(output).get_tensor().alloc_float(core.CPUPlace()) + + # TODO(yuyang18): Only CPU is support now. + cpu_ctx = core.DeviceContext.create(core.CPUPlace()) + + def get_output(): + op.run(local_scope, cpu_ctx) + return numpy.array(local_scope.find_var(output_name).get_tensor()).sum() + + def product(dim): + return reduce(lambda a, b: a * b, dim, 1) + + tensor_to_check = local_scope.find_var(input_to_check).get_tensor() + tensor_size = product(tensor_to_check.get_dims()) + gradient_flat = numpy.zeros(shape=(tensor_size, ), dtype='float32') + for i in xrange(tensor_size): + origin = tensor_to_check.get_float_element(i) + x_pos = origin + delta + tensor_to_check.set_float_element(i, x_pos) + y_pos = get_output() + + x_neg = origin - delta + tensor_to_check.set_float_element(i, x_neg) + y_neg = get_output() + + tensor_to_check.set_float_element(i, origin) # restore old value + gradient_flat[i] = (y_pos - y_neg) / delta / 2 + return gradient_flat.reshape(tensor_to_check.get_dims()) + + +class GradientChecker(unittest.TestCase): + def __is_close(self, numeric_grads, scope, max_relative_error): + for name in numeric_grads: + op_grad = numpy.array( + scope.find_var(grad_var_name(name)).get_tensor()) + is_close = numpy.allclose( + numeric_grads[name], op_grad, rtol=max_relative_error, atol=100) + if not is_close: + return False + return True + + def check_grad(self, 
+ forward_op, + input_vars, + inputs_to_check, + output_name, + no_grad_set=None, + only_cpu=False, + max_relative_error=0.005): + """ + :param forward_op: used to create backward_op + :param input_vars: numpy value of input variable. The following + computation will use these variables. + :param inputs_to_check: inputs var names that should check gradient. + :param output_name: output name that used to + :param max_relative_error: The relative tolerance parameter. + :param no_grad_set: used when create backward ops + :param only_cpu: only compute and check gradient on cpu kernel. + :return: + """ + if no_grad_set is None: + no_grad_set = set() + + tmp_outs = forward_op.temp_outputs() + no_tmp_out = filter(lambda name: name not in tmp_outs, + forward_op.outputs()) + if len(no_tmp_out) != 1: + raise ValueError("non temp out_names should be 1") + + in_names = forward_op.inputs() + for no_grad in no_grad_set: + if no_grad not in in_names: + raise ValueError("no_grad should be in in_names") + + backward_op = core.Operator.backward(forward_op, no_grad_set) + + places = [core.CPUPlace()] + if not only_cpu and core.is_compile_gpu() and backward_op.support_gpu(): + places.append(core.GPUPlace(0)) + + numeric_grad = dict() + # get numeric gradient + for check_name in inputs_to_check: + numeric_grad[check_name] = \ + get_numeric_gradient(forward_op, input_vars, output_name, check_name) + + # get operator gradient according to different device + for place in places: + scope = core.Scope() + ctx = core.DeviceContext.create(place) + + # create input var and set value + for name, value in input_vars.iteritems(): + if name not in in_names: + raise ValueError(name + " not in op.inputs_") + var = scope.new_var(name).get_tensor() + var.set_dims(value.shape) + var.set(value, place) + + # create output var + for out_name in forward_op.outputs(): + scope.new_var(out_name).get_tensor() + + # infer the shape of output var and compute/set value of output var + 
forward_op.infer_shape(scope) + forward_op.run(scope, ctx) + + # create output grad var + # set shape as the output var + # set value of this grad to ones + for name in forward_op.outputs(): + out_tensor = scope.find_var(name).get_tensor() + grad_tensor = scope.new_var(grad_var_name(name)).get_tensor() + grad_tensor.set_dims(out_tensor.shape()) + data = 1.0 * numpy.ones(out_tensor.shape()) + grad_tensor.set(data, place) + + # create input grad var + for name in backward_op.outputs(): + scope.new_var(name).get_tensor() + + # infer the shape of input gradient var and compute/set it's value + # with backward op + backward_op.infer_shape(scope) + backward_op.run(scope, ctx) + + if isinstance(place, core.CPUPlace): + msg = "CPU kernel gradient is not close to numeric gradient" + else: + if isinstance(place, core.GPUPlace): + msg = "GPU kernel gradient is not close to numeric gradient" + else: + raise ValueError("unknown place " + type(place)) + self.assertTrue( + self.__is_close(numeric_grad, scope, max_relative_error), msg) + + +if __name__ == '__main__': + + class GetNumericGradientTest(unittest.TestCase): + def test_add_op(self): + add_op = Operator('add_two', X="X", Y="Y", Out="Z") + x = numpy.random.random((10, 1)).astype("float32") + y = numpy.random.random((10, 1)).astype("float32") + + arr = get_numeric_gradient(add_op, {'X': x, "Y": y}, 'Z', 'X') + self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-2) + + def test_softmax_op(self): + def stable_softmax(x): + """Compute the softmax of vector x in a numerically stable way.""" + shiftx = x - numpy.max(x) + exps = numpy.exp(shiftx) + return exps / numpy.sum(exps) + + def label_softmax_grad(Y, dY): + dX = Y * 0.0 + for i in range(Y.shape[0]): + d = numpy.dot(Y[i, :], dY[i, :]) + dX[i, :] = Y[i, :] * (dY[i, :] - d) + return dX + + softmax_op = Operator("softmax", X="X", Y="Y") + + X = numpy.random.random((2, 2)).astype("float32") + Y = numpy.apply_along_axis(stable_softmax, 1, X) + dY = numpy.ones(Y.shape) + dX = 
label_softmax_grad(Y, dY) + + arr = get_numeric_gradient(softmax_op, {"X": X}, 'Y', 'X') + numpy.testing.assert_almost_equal(arr, dX, decimal=1e-2) + + unittest.main() diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py index 99085c367221150c8386a24e8d90d58fd63894c4..dd65e0f2dc23d3f657ff16c55fb297dae210b2d7 100644 --- a/python/paddle/v2/framework/tests/op_test_util.py +++ b/python/paddle/v2/framework/tests/op_test_util.py @@ -1,7 +1,6 @@ -import paddle.v2.framework.core as core -import unittest import numpy -import paddle.v2.framework.create_op_creation_methods as creation +import paddle.v2.framework.core as core +from paddle.v2.framework.op import Operator class OpTestMeta(type): @@ -21,45 +20,52 @@ class OpTestMeta(type): obj = super(OpTestMeta, cls).__new__(cls, name, bases, attrs) def test_all(self): - func = getattr(creation.op_creations, self.type, None) - self.assertIsNotNone(func) - scope = core.Scope() kwargs = dict() + places = [core.CPUPlace()] + if core.is_compile_gpu(): + places.append(core.GPUPlace(0)) - for in_name in func.all_input_args: - if hasattr(self, in_name): - kwargs[in_name] = in_name - var = scope.new_var(in_name).get_tensor() - arr = getattr(self, in_name) - var.set_dims(arr.shape) - var.set(arr) - else: - kwargs[in_name] = "@EMPTY@" + for place in places: + for in_name in Operator.get_op_input_names(self.type): + if hasattr(self, "inputs") and in_name in self.inputs: + kwargs[in_name] = in_name + var = scope.new_var(in_name).get_tensor() + arr = self.inputs[in_name] + var.set_dims(arr.shape) + var.set(arr, place) + else: + kwargs[in_name] = "@EMPTY@" - for out_name in func.all_output_args: - if hasattr(self, out_name): + for out_name in Operator.get_op_output_names(self.type): + if not hasattr(self, "outputs"): + raise ValueError( + "The test op must set self.outputs dict.") + if out_name not in self.outputs: + raise ValueError("The %s is not in self.outputs dict." 
% + (out_name)) kwargs[out_name] = out_name scope.new_var(out_name).get_tensor() - for attr_name in func.all_attr_args: - if hasattr(self, attr_name): - kwargs[attr_name] = getattr(self, attr_name) + for attr_name in Operator.get_op_attr_names(self.type): + if hasattr(self, "attrs") and attr_name in self.attrs: + kwargs[attr_name] = self.attrs[attr_name] - op = func(**kwargs) + op = Operator(self.type, **kwargs) + if isinstance(place, core.GPUPlace) and not op.support_gpu(): + return - op.infer_shape(scope) + op.infer_shape(scope) - ctx = core.DeviceContext.cpu_context() - op.run(scope, ctx) + ctx = core.DeviceContext.create(place) + op.run(scope, ctx) - for out_name in func.all_output_args: - actual = numpy.array(scope.find_var(out_name).get_tensor()) - expect = getattr(self, out_name) - # TODO(qijun) The default decimal is 7, but numpy.dot and eigen.mul - # has some diff, and could not pass unittest. So I set decimal 3 here. - # And I will check this in future. - numpy.testing.assert_almost_equal(actual, expect, decimal=3) + for out_name in Operator.get_op_output_names(self.type): + actual = numpy.array(scope.find_var(out_name).get_tensor()) + expect = self.outputs[out_name] + self.assertTrue( + numpy.allclose(actual, expect), + "output name: " + out_name + "has diff") obj.test_all = test_all return obj diff --git a/python/paddle/v2/framework/tests/test_add_two_op.py b/python/paddle/v2/framework/tests/test_add_two_op.py index a06d7a78ecf838a49e5f2808d3686c6b92faa8ce..c0237830647371e14b755953345965a3eac7bfd2 100644 --- a/python/paddle/v2/framework/tests/test_add_two_op.py +++ b/python/paddle/v2/framework/tests/test_add_two_op.py @@ -1,6 +1,10 @@ import unittest -from op_test_util import OpTestMeta + import numpy +import paddle.v2.framework.core as core +from paddle.v2.framework.op import Operator + +from op_test_util import OpTestMeta class TestAddOp(unittest.TestCase): @@ -8,9 +12,20 @@ class TestAddOp(unittest.TestCase): def setUp(self): self.type = "add_two" - 
self.X = numpy.random.random((342, 345)).astype("float32") - self.Y = numpy.random.random((342, 345)).astype("float32") - self.Out = self.X + self.Y + self.inputs = { + 'X': numpy.random.random((102, 105)).astype("float32"), + 'Y': numpy.random.random((102, 105)).astype("float32") + } + self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']} + + +class TestAddGradOp(unittest.TestCase): + def test_add_grad(self): + op = Operator('add_two', X="X", Y="Y", Out="Out") + backward_op = core.Operator.backward(op, set()) + self.assertEqual(backward_op.type(), "add_two_grad") + expected = '''Op(add_two_grad), inputs:(X, Y, Out, Out@GRAD), outputs:(X@GRAD, Y@GRAD).''' + self.assertEqual(expected, str(backward_op)) if __name__ == '__main__': diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 609c56535ef0365dda728cba334d8b4d96312192..4815192e255c6e0429db3f50918a76a773b30131 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -1,21 +1,37 @@ import unittest import numpy from op_test_util import OpTestMeta +from gradient_checker import GradientChecker, create_op -class TestSGD(unittest.TestCase): +class TestCrossEntropy(unittest.TestCase): __metaclass__ = OpTestMeta def setUp(self): + # TODO this unit test is not passed self.type = "onehot_cross_entropy" batch_size = 100 class_num = 10 - self.X = numpy.random.random((batch_size, class_num)).astype("float32") - self.label = 5 * numpy.ones(batch_size).astype("int32") + X = numpy.random.random((batch_size, class_num)).astype("float32") + label = 5 * numpy.ones(batch_size).astype("int32") + self.inputs = {'X': X, 'label': label} Y = [] for i in range(0, batch_size): - Y.append(-numpy.log(self.X[i][self.label[i]])) - self.Y = numpy.array(Y).astype("float32") + Y.append(-numpy.log(X[i][label[i]])) + self.outputs = {'Y': numpy.array(Y).astype("float32")} + + +class 
CrossEntropyGradOpTest(GradientChecker): + def test_softmax_grad(self): + op = create_op("onehot_cross_entropy") + batch_size = 100 + class_num = 10 + inputs = { + "X": numpy.random.uniform( + 0.1, 1.0, [batch_size, class_num]).astype("float32"), + "label": (class_num / 2) * numpy.ones(batch_size).astype("int32") + } + self.check_grad(op, inputs, set("X"), "Y") if __name__ == "__main__": diff --git a/python/paddle/v2/framework/tests/test_fc_op.py b/python/paddle/v2/framework/tests/test_fc_op.py index 43931aac406cd93beede008066aa1c0c00eba6ea..e24435839d305bb1a4ab7daa3e9684a421468fd8 100644 --- a/python/paddle/v2/framework/tests/test_fc_op.py +++ b/python/paddle/v2/framework/tests/test_fc_op.py @@ -1,28 +1,30 @@ import paddle.v2.framework.core as core import unittest import numpy -import paddle.v2.framework.create_op_creation_methods as creation +from paddle.v2.framework.op import Operator class TestFc(unittest.TestCase): def test_fc(self): scope = core.Scope() + place = core.CPUPlace() x = scope.new_var("X") + x_tensor = x.get_tensor() x_tensor.set_dims([1000, 784]) - x_tensor.alloc_float() + x_tensor.alloc_float(place) w = scope.new_var("W") w_tensor = w.get_tensor() w_tensor.set_dims([784, 100]) - w_tensor.alloc_float() + w_tensor.alloc_float(place) - w_tensor.set(numpy.random.random((784, 100)).astype("float32")) + w_tensor.set(numpy.random.random((784, 100)).astype("float32"), place) # Set a real numpy array here. 
# x_tensor.set(numpy.array([])) - op = creation.op_creations.fc(X="X", Y="Y", W="W") + op = Operator("fc", X="X", Y="Y", W="W") for out in op.outputs(): if scope.find_var(out) is None: @@ -32,7 +34,7 @@ class TestFc(unittest.TestCase): op.infer_shape(scope) self.assertEqual([1000, 100], tensor.shape()) - ctx = core.DeviceContext.cpu_context() + ctx = core.DeviceContext.create(place) op.run(scope, ctx) diff --git a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py b/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e5c862605fb11a5ea1426cf8f9054589dc377ff1 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py @@ -0,0 +1,16 @@ +import unittest +from op_test_util import OpTestMeta +import numpy + + +class TestFillZerosLikeOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "fill_zeros_like" + self.inputs = {'Src': numpy.random.random((219, 232)).astype("float32")} + self.outputs = {'Dst': numpy.zeros_like(self.inputs['Src'])} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_mean_op.py b/python/paddle/v2/framework/tests/test_mean_op.py index 78fff1eeff998109a51ea662f963a102eff49d3a..b5d52b90567bcd0c9f376147145d8638049f7bab 100644 --- a/python/paddle/v2/framework/tests/test_mean_op.py +++ b/python/paddle/v2/framework/tests/test_mean_op.py @@ -8,8 +8,8 @@ class TestMeanOp(unittest.TestCase): def setUp(self): self.type = "mean" - self.X = np.random.random((32, 784)).astype("float32") - self.Out = np.mean(self.X) + self.inputs = {'X': np.random.random((32, 784)).astype("float32")} + self.outputs = {'Out': np.mean(self.inputs['X'])} if __name__ == '__main__': diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/framework/tests/test_mul_op.py index 0a87e66cd03af1bf84be8ffe111e4a8c3a24d6dc..ec0ac99156a546dd3fb7b27778032bece38ab5a9 100644 --- 
a/python/paddle/v2/framework/tests/test_mul_op.py +++ b/python/paddle/v2/framework/tests/test_mul_op.py @@ -8,9 +8,11 @@ class TestMulOp(unittest.TestCase): def setUp(self): self.type = "mul" - self.X = np.random.random((32, 784)).astype("float32") - self.Y = np.random.random((784, 100)).astype("float32") - self.Out = np.dot(self.X, self.Y) + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'Y': np.random.random((84, 100)).astype("float32") + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} if __name__ == '__main__': diff --git a/python/paddle/v2/framework/tests/test_net.py b/python/paddle/v2/framework/tests/test_net.py index db776d6b643dc4014da9f5dded8219180af639e3..b30896553dea4a4929038d524b23c6090bbed380 100644 --- a/python/paddle/v2/framework/tests/test_net.py +++ b/python/paddle/v2/framework/tests/test_net.py @@ -1,16 +1,16 @@ import paddle.v2.framework.core as core -from paddle.v2.framework.create_op_creation_methods import op_creations +from paddle.v2.framework.op import Operator import unittest class TestNet(unittest.TestCase): def test_net_all(self): net = core.Net.create() - op1 = op_creations.add_two(X="X", Y="Y", Out="Out") + op1 = Operator("add_two", X="X", Y="Y", Out="Out") net.add_op(op1) net2 = core.Net.create() - net2.add_op(op_creations.fc(X="X", W="w", Y="fc.out")) + net2.add_op(Operator("fc", X="X", W="w", Y="fc.out")) net2.complete_add_op(True) net.add_op(net2) net.complete_add_op(True) diff --git a/python/paddle/v2/framework/tests/test_network.py b/python/paddle/v2/framework/tests/test_network.py deleted file mode 100644 index 6d53e233e959bd39b558ac97cdca381135505f8d..0000000000000000000000000000000000000000 --- a/python/paddle/v2/framework/tests/test_network.py +++ /dev/null @@ -1,32 +0,0 @@ -from paddle.v2.framework.network import Network -import paddle.v2.framework.core as core -import unittest - - -class TestNet(unittest.TestCase): - def test_net_all(self): - net = Network() - out = 
net.add_two(X="X", Y="Y") - fc_out = net.fc(X=out, W="w") - net.complete_add_op() - self.assertTrue(isinstance(fc_out, core.Variable)) - self.assertEqual( - '''Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, add_two@OUT@0, fc@OUT@1). - Op(add_two), inputs:(X, Y), outputs:(add_two@OUT@0). - Op(fc), inputs:(add_two@OUT@0, w, @EMPTY@), outputs:(fc@OUT@1, @TEMP@fc@0). - Op(mul), inputs:(add_two@OUT@0, w), outputs:(@TEMP@fc@0). - Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc@OUT@1). -''', str(net)) - - net2 = Network() - tmp = net2.add_two(X="X", Y="Y") - self.assertTrue(isinstance(tmp, core.Variable)) - net2.complete_add_op() - self.assertEqual( - '''Op(plain_net), inputs:(X, Y), outputs:(add_two@OUT@2). - Op(add_two), inputs:(X, Y), outputs:(add_two@OUT@2). -''', str(net2)) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/v2/framework/tests/test_op_creation_methods.py b/python/paddle/v2/framework/tests/test_operator.py similarity index 54% rename from python/paddle/v2/framework/tests/test_op_creation_methods.py rename to python/paddle/v2/framework/tests/test_operator.py index 41db7c0d535aa920b34d6cc346090a8c15bfb110..4f164e1a69e3fd0409f9b575a8bd9b4e423b486b 100644 --- a/python/paddle/v2/framework/tests/test_op_creation_methods.py +++ b/python/paddle/v2/framework/tests/test_operator.py @@ -1,14 +1,14 @@ import unittest -import paddle.v2.framework.create_op_creation_methods as creation +import paddle.v2.framework.op as op import paddle.v2.framework.core as core import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2 import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2 -import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2 +import paddle.v2.framework.proto.attribute_pb2 as attribute_pb2 class TestGetAllProtos(unittest.TestCase): def test_all(self): - all_protos = creation.get_all_op_protos() + all_protos = op.get_all_op_protos() self.assertNotEqual(0, len(all_protos)) for each in all_protos: @@ -17,25 
+17,25 @@ class TestGetAllProtos(unittest.TestCase): class TestOpDescCreationMethod(unittest.TestCase): def test_plain_input_output(self): - op = op_proto_pb2.OpProto() - op.type = "test" - ipt = op.inputs.add() + op_proto = op_proto_pb2.OpProto() + op_proto.type = "test" + ipt = op_proto.inputs.add() ipt.name = "X" ipt.comment = "not matter" - ipt = op.inputs.add() + ipt = op_proto.inputs.add() ipt.name = "Y" ipt.comment = "not matter" - opt = op.outputs.add() + opt = op_proto.outputs.add() opt.name = "Z" opt.comment = "not matter" - op.comment = "not matter" + op_proto.comment = "not matter" - self.assertTrue(op.IsInitialized()) + self.assertTrue(op_proto.IsInitialized()) - method = creation.OpDescCreationMethod(op) + method = op.OpDescCreationMethod(op_proto) output = method(X="a", Y="b", Z="c") expected = op_desc_pb2.OpDesc() @@ -45,29 +45,29 @@ class TestOpDescCreationMethod(unittest.TestCase): self.assertEqual(expected, output) def test_multiple_input_plain_output(self): - op = op_proto_pb2.OpProto() - op.type = "fc" - ipt = op.inputs.add() + op_proto = op_proto_pb2.OpProto() + op_proto.type = "fc" + ipt = op_proto.inputs.add() ipt.name = "X" ipt.comment = "" ipt.multiple = True - ipt = op.inputs.add() + ipt = op_proto.inputs.add() ipt.name = "W" ipt.comment = "" ipt.multiple = True - ipt = op.inputs.add() + ipt = op_proto.inputs.add() ipt.name = "b" ipt.comment = "" - out = op.outputs.add() + out = op_proto.outputs.add() out.name = "Y" out.comment = "" - op.comment = "" - self.assertTrue(op.IsInitialized()) - method = creation.OpDescCreationMethod(op) + op_proto.comment = "" + self.assertTrue(op_proto.IsInitialized()) + method = op.OpDescCreationMethod(op_proto) generated1 = method(X="x", W="w", b="b", Y="y") expected1 = op_desc_pb2.OpDesc() @@ -76,7 +76,7 @@ class TestOpDescCreationMethod(unittest.TestCase): expected1.type = 'fc' attr = expected1.attrs.add() attr.name = 'input_format' - attr.type = attr_type_pb2.INTS + attr.type = attribute_pb2.INTS 
attr.ints.extend([0, 1, 2, 3]) self.assertEqual(expected1, generated1) @@ -88,34 +88,34 @@ class TestOpDescCreationMethod(unittest.TestCase): expected2.type = 'fc' attr = expected2.attrs.add() attr.name = 'input_format' - attr.type = attr_type_pb2.INTS + attr.type = attribute_pb2.INTS attr.ints.extend([0, 3, 6, 7]) self.assertEqual(expected2, generated2) def test_attrs(self): - op = op_proto_pb2.OpProto() - op.type = "test" - ipt = op.inputs.add() + op_proto = op_proto_pb2.OpProto() + op_proto.type = "test" + ipt = op_proto.inputs.add() ipt.name = 'X' ipt.comment = "" def __add_attr__(name, type): - attr = op.attrs.add() + attr = op_proto.attrs.add() attr.name = name attr.comment = "" attr.type = type - __add_attr__("int_attr", attr_type_pb2.INT) - __add_attr__("float_attr", attr_type_pb2.FLOAT) - __add_attr__("string_attr", attr_type_pb2.STRING) - __add_attr__("ints_attr", attr_type_pb2.INTS) - __add_attr__("floats_attr", attr_type_pb2.FLOATS) - __add_attr__("strings_attr", attr_type_pb2.STRINGS) + __add_attr__("int_attr", attribute_pb2.INT) + __add_attr__("float_attr", attribute_pb2.FLOAT) + __add_attr__("string_attr", attribute_pb2.STRING) + __add_attr__("ints_attr", attribute_pb2.INTS) + __add_attr__("floats_attr", attribute_pb2.FLOATS) + __add_attr__("strings_attr", attribute_pb2.STRINGS) - op.comment = "" - self.assertTrue(op.IsInitialized()) + op_proto.comment = "" + self.assertTrue(op_proto.IsInitialized()) - method = creation.OpDescCreationMethod(op) + method = op.OpDescCreationMethod(op_proto) generated = method( X="a", @@ -131,119 +131,68 @@ class TestOpDescCreationMethod(unittest.TestCase): expected.inputs.extend(['a']) attr = expected.attrs.add() attr.name = "int_attr" - attr.type = attr_type_pb2.INT + attr.type = attribute_pb2.INT attr.i = 10 attr = expected.attrs.add() attr.name = "float_attr" - attr.type = attr_type_pb2.FLOAT + attr.type = attribute_pb2.FLOAT attr.f = 3.2 attr = expected.attrs.add() attr.name = "string_attr" - attr.type = 
attr_type_pb2.STRING + attr.type = attribute_pb2.STRING attr.s = "test_str" attr = expected.attrs.add() attr.name = "ints_attr" - attr.type = attr_type_pb2.INTS + attr.type = attribute_pb2.INTS attr.ints.extend([0, 1, 2, 3, 4]) attr = expected.attrs.add() attr.name = 'floats_attr' - attr.type = attr_type_pb2.FLOATS + attr.type = attribute_pb2.FLOATS attr.floats.extend([0.2, 3.2, 4.5]) attr = expected.attrs.add() attr.name = 'strings_attr' - attr.type = attr_type_pb2.STRINGS + attr.type = attribute_pb2.STRINGS attr.strings.extend(['a', 'b', 'c']) self.assertEqual(expected, generated) def test_input_temporary_output(self): - op = op_proto_pb2.OpProto() - op.type = "test" - out = op.outputs.add() + op_proto = op_proto_pb2.OpProto() + op_proto.type = "test" + out = op_proto.outputs.add() out.name = "OUT" out.comment = "" - out = op.outputs.add() + out = op_proto.outputs.add() out.name = "TMP" out.comment = "" out.temporary = True - out = op.outputs.add() + out = op_proto.outputs.add() out.name = "OUT2" out.comment = "" - op.comment = "" + op_proto.comment = "" - method = creation.OpDescCreationMethod(op) + method = op.OpDescCreationMethod(op_proto) generated = method(OUT="a", OUT2="b") desc = op_desc_pb2.OpDesc() desc.outputs.extend(["a", core.var_names.temp(), "b"]) desc.type = "test" attr = desc.attrs.add() attr.name = "temporary_index" - attr.type = attr_type_pb2.INTS + attr.type = attribute_pb2.INTS attr.ints.append(2) self.assertEqual(generated, desc) -class TestOpCreationDocStr(unittest.TestCase): - def test_all(self): - op = op_proto_pb2.OpProto() - op.type = "test" - op.comment = """Test Op. - -This op is used for unit test, not a real op. 
-""" - a = op.inputs.add() - a.name = "a" - a.comment = "Input a for test op" - a.multiple = True - - b = op.inputs.add() - b.name = "b" - b.comment = "Input b for test op" - self.assertTrue(op.IsInitialized()) - - o1 = op.outputs.add() - o1.name = "output" - o1.comment = "The output of test op" - - o2 = op.outputs.add() - o2.name = "temp output" - o2.comment = "The temporary output of test op" - o2.temporary = True - - test_str = op.attrs.add() - test_str.name = "str_attr" - test_str.type = attr_type_pb2.STRING - test_str.comment = "A string attribute for test op" - - actual = creation.get_docstring_from_op_proto(op) - expected_docstring = '''Test Op. - -This op is used for unit test, not a real op. - -:param a: Input a for test op -:type a: list | basestr -:param b: Input b for test op -:type b: basestr -:param output: The output of test op -:type output: basestr -:param temp output: This is a temporary variable. It does not have to set by user. The temporary output of test op -:type temp output: basestr -:param str_attr: A string attribute for test op -:type str_attr: basestr -''' - self.assertEqual(expected_docstring, actual) - - class TestOpCreations(unittest.TestCase): def test_all(self): - add_op = creation.op_creations.add_two(X="a", Y="b", Out="z") + add_op = op.Operator("add_two", X="a", Y="b", Out="z") self.assertIsNotNone(add_op) # Invoke C++ DebugString() self.assertEqual('Op(add_two), inputs:(a, b), outputs:(z).', diff --git a/python/paddle/v2/framework/tests/test_protobuf.py b/python/paddle/v2/framework/tests/test_protobuf.py index b8702477e64203e735bff05b115eafbb2a52172d..69e98e2f250a9df23b25e7e2043af29f87c996a0 100644 --- a/python/paddle/v2/framework/tests/test_protobuf.py +++ b/python/paddle/v2/framework/tests/test_protobuf.py @@ -1,12 +1,10 @@ -import paddle.v2.framework.proto.op_proto_pb2 -import paddle.v2.framework.proto.attr_type_pb2 +import paddle.v2.framework.proto.op_proto_pb2 as op_proto_lib +import paddle.v2.framework.proto.attribute_pb2 
as attr_type_lib import unittest class TestFrameworkProto(unittest.TestCase): def test_all(self): - op_proto_lib = paddle.v2.framework.proto.op_proto_pb2 - attr_type_lib = paddle.v2.framework.proto.attr_type_pb2 op_proto = op_proto_lib.OpProto() ipt0 = op_proto.inputs.add() ipt0.name = "a" diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py index 0457e3f16a709140180ce433c1d56d146f0b6974..5c77c477b347f4713e4af2a8cb462b243d7a779c 100644 --- a/python/paddle/v2/framework/tests/test_recurrent_op.py +++ b/python/paddle/v2/framework/tests/test_recurrent_op.py @@ -1,3 +1,4 @@ +import logging import paddle.v2.framework.core as core import unittest import numpy as np @@ -7,10 +8,9 @@ ops = creation.op_creations def create_tensor(scope, name, shape): - tensor = scope.create_var(name).get_tensor() + tensor = scope.new_var(name).get_tensor() tensor.set_dims(shape) - tensor.alloc_float() - tensor.set(np.random.random(shape)) + tensor.set(np.random.random(shape), core.CPUPlace()) return tensor @@ -31,40 +31,36 @@ class TestRNN(unittest.TestCase): - h ''' + input_dim = 30 + batch_size = 50 + weight_dim = 15 + sent_len = 11 + def init(self): - input_dim = 30 - batch_size = 50 - weight_dim = 15 - - self.scope = core.Scope(None) - - # create vars - create_tensor(self.scope, "x", [batch_size, input_dim]) - create_tensor(self.scope, "W", [input_dim, weight_dim]) - create_tensor(self.scope, "U", [weight_dim, weight_dim]) - create_tensor(self.scope, "h_boot", [batch_size, weight_dim]) - - x_alias = "x@alias" - y_alias = "y@alias" - memory = "h@alias" - prememory = "h@pre" - output = "rnn_out" - output_alias = "rnn_out@alias" - - # create step net - stepnet_var = self.scope.create_var("stepnet") - stepnet = stepnet_var.get_net() - # stepnet = core.Net.create() - x_fc_op = ops.fc(X=x_alias, W="W", Y="Wx") - h_fc_op = ops.fc(X=prememory, W="U", Y="Uh") - sum_op = ops.add_two(X="Wx", Y="Uh", Out="sum") - sig_op = 
ops.sigmoid(X="sum", Y=memory) - stepnet.add_op(x_fc_op) - stepnet.add_op(h_fc_op) - stepnet.add_op(sum_op) - stepnet.add_op(sig_op) - stepnet.complete_add_op(True) + self.scope = core.Scope() + + self.create_global_variables() + self.create_step_net() + rnn_op = self.create_rnn_op() + ctx = core.DeviceContext.create(core.CPUPlace()) + print 'infer_shape' + rnn_op.infer_shape(self.scope) + + rnn_op.run(self.scope, ctx) + + def create_global_variables(self): + # create inlink + create_tensor(self.scope, "x", + [self.sent_len, self.batch_size, self.input_dim]) + create_tensor(self.scope, "W", [self.input_dim, self.input_dim]) + create_tensor(self.scope, "U", [self.input_dim, self.input_dim]) + create_tensor(self.scope, "h_boot", [self.batch_size, self.input_dim]) + self.scope.new_var("step_scopes") + self.scope.new_var("h@alias") + self.scope.new_var("h") + + def create_rnn_op(self): # create RNNOp rnnop = ops.recurrent_op( # inputs @@ -72,17 +68,27 @@ class TestRNN(unittest.TestCase): boot_memories=["h_boot"], step_net="stepnet", # outputs - outlinks=[output], + outlinks=["h"], step_scopes="step_scopes", # attributes inlink_alias=["x@alias"], - outlink_alias=[output_alias], - pre_memories=[prememory], - memories=[memory]) + outlink_alias=["h@alias"], + pre_memories=["h@pre"], + memories=["h@alias"]) + return rnnop + + def create_step_net(self): + var = self.scope.new_var("stepnet") + stepnet = var.get_net() - ctx = core.DeviceContext.cpu_context() - rnnop.infer_shape(self.scope) - rnnop.run(self.scope, ctx) + x_fc_op = ops.fc(X="x@alias", W="W", Y="Wx") + h_fc_op = ops.fc(X="h@pre", W="U", Y="Uh") + sum_op = ops.add_two(X="Wx", Y="Uh", Out="sum") + sig_op = ops.sigmoid(X="sum", Y="h@alias") + + for op in [x_fc_op, h_fc_op, sum_op, sig_op]: + stepnet.add_op(op) + stepnet.complete_add_op(True) def test_recurrent(self): self.init() diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py index 
ef1514983c03f822f84b85437d1cfe653b6a1a2e..f8521eb517057fbeb104b28af7da4fffe54f37de 100644 --- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py +++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py @@ -8,9 +8,11 @@ class TestRowwiseAddOp(unittest.TestCase): def setUp(self): self.type = "rowwise_add" - self.X = np.random.random((32, 784)).astype("float32") - self.b = np.random.random(784).astype("float32") - self.Out = np.add(self.X, self.b) + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'b': np.random.random(84).astype("float32") + } + self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])} if __name__ == '__main__': diff --git a/python/paddle/v2/framework/tests/test_sgd_op.py b/python/paddle/v2/framework/tests/test_sgd_op.py index 405d73b224fa153e50b4ec408a921f2bdaab46aa..e5f9ef865e84f1a78e28884ad7e2e758f9ca8054 100644 --- a/python/paddle/v2/framework/tests/test_sgd_op.py +++ b/python/paddle/v2/framework/tests/test_sgd_op.py @@ -8,10 +8,13 @@ class TestSGD(unittest.TestCase): def setUp(self): self.type = "sgd" - self.param = numpy.random.random((342, 345)).astype("float32") - self.grad = numpy.random.random((342, 345)).astype("float32") - self.learning_rate = 0.1 - self.param_out = self.param - self.learning_rate * self.grad + w = numpy.random.random((102, 105)).astype("float32") + g = numpy.random.random((102, 105)).astype("float32") + lr = 0.1 + + self.inputs = {'param': w, 'grad': g} + self.attrs = {'learning_rate': lr} + self.outputs = {'param_out': w - lr * g} if __name__ == "__main__": diff --git a/python/paddle/v2/framework/tests/test_sigmoid_op.py b/python/paddle/v2/framework/tests/test_sigmoid_op.py index 50044a122f1d66dd54a24f6cce76074a60ee2262..2a57a41ed8b718fd420062ba68e853a4861b7359 100644 --- a/python/paddle/v2/framework/tests/test_sigmoid_op.py +++ b/python/paddle/v2/framework/tests/test_sigmoid_op.py @@ -8,9 +8,12 @@ class TestSigmoidOp(unittest.TestCase): def setUp(self): self.type = "sigmoid" - 
self.X = np.random.random((32, 100)).astype("float32") - self.Y = 1 / (1 + np.exp(-self.X)) + self.inputs = {'X': np.random.random((32, 100)).astype("float32")} + self.outputs = {'Y': 1 / (1 + np.exp(-self.inputs['X']))} +#class TestSigmoidGradOp(unittest.TestCase): +#TODO(qingqing) add unit test + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_softmax_op.py b/python/paddle/v2/framework/tests/test_softmax_op.py index 191b698c1cdec9b86b4ded6b1f743586867ca62f..e670d93653e07d35e5019c9daac45c214eddf367 100644 --- a/python/paddle/v2/framework/tests/test_softmax_op.py +++ b/python/paddle/v2/framework/tests/test_softmax_op.py @@ -1,7 +1,10 @@ import unittest -from op_test_util import OpTestMeta + import numpy as np +from gradient_checker import GradientChecker, create_op +from op_test_util import OpTestMeta + def stable_softmax(x): """Compute the softmax of vector x in a numerically stable way.""" @@ -15,8 +18,17 @@ class TestSoftmaxOp(unittest.TestCase): def setUp(self): self.type = "softmax" - self.X = np.random.random((32, 100)).astype("float32") - self.Y = np.apply_along_axis(stable_softmax, 1, self.X) + self.inputs = {'X': np.random.random((32, 100)).astype("float32")} + self.outputs = { + 'Y': np.apply_along_axis(stable_softmax, 1, self.inputs['X']) + } + + +class SoftmaxGradOpTest(GradientChecker): + def test_softmax(self): + op = create_op("softmax") + inputs = {"X": np.random.uniform(0.1, 1, [10, 10]).astype("float32")} + self.check_grad(op, inputs, set("X"), "Y") if __name__ == '__main__': diff --git a/python/paddle/v2/framework/tests/test_tensor.py b/python/paddle/v2/framework/tests/test_tensor.py index 6d59863cea29832f648139e07a134050e22bfa21..1af39818a305215b45219b8c5f0a10630fd64279 100644 --- a/python/paddle/v2/framework/tests/test_tensor.py +++ b/python/paddle/v2/framework/tests/test_tensor.py @@ -7,16 +7,17 @@ class TestScope(unittest.TestCase): def test_int_tensor(self): scope = core.Scope() var = 
scope.new_var("test_tensor") + place = core.CPUPlace() + tensor = var.get_tensor() tensor.set_dims([1000, 784]) - tensor.alloc_int() - + tensor.alloc_int(place) tensor_array = numpy.array(tensor) self.assertEqual((1000, 784), tensor_array.shape) tensor_array[3, 9] = 1 tensor_array[19, 11] = 2 - tensor.set(tensor_array) + tensor.set(tensor_array, place) tensor_array_2 = numpy.array(tensor) self.assertEqual(1.0, tensor_array_2[3, 9]) @@ -25,16 +26,18 @@ class TestScope(unittest.TestCase): def test_float_tensor(self): scope = core.Scope() var = scope.new_var("test_tensor") + place = core.CPUPlace() + tensor = var.get_tensor() tensor.set_dims([1000, 784]) - tensor.alloc_float() + tensor.alloc_float(place) tensor_array = numpy.array(tensor) self.assertEqual((1000, 784), tensor_array.shape) tensor_array[3, 9] = 1.0 tensor_array[19, 11] = 2.0 - tensor.set(tensor_array) + tensor.set(tensor_array, place) tensor_array_2 = numpy.array(tensor) self.assertAlmostEqual(1.0, tensor_array_2[3, 9]) diff --git a/python/paddle/v2/framework/tests/test_uniform_random_op.py b/python/paddle/v2/framework/tests/test_uniform_random_op.py new file mode 100644 index 0000000000000000000000000000000000000000..c3d2bb44da3977c0899b2609a8efe15b7e1789f2 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py @@ -0,0 +1,35 @@ +import unittest +from paddle.v2.framework.op import Operator +import paddle.v2.framework.core as core +import numpy + + +class UniformRandomTest(unittest.TestCase): + def test_uniform_random_cpu(self): + self.uniform_random_test(place=core.CPUPlace()) + + def test_uniform_random_gpu(self): + if core.is_compile_gpu(): + self.uniform_random_test(place=core.GPUPlace(0)) + + def uniform_random_test(self, place): + scope = core.Scope() + scope.new_var("X").get_tensor() + + op = Operator( + "uniform_random", + Out="X", + dims=[1000, 784], + min=-5.0, + max=10.0, + seed=10) + + op.infer_shape(scope) + ctx = core.DeviceContext.create(place) + op.run(scope, ctx) + 
tensor = numpy.array(scope.find_var("X").get_tensor()) + self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/plot/tests/CMakeLists.txt b/python/paddle/v2/plot/tests/CMakeLists.txt index da5cd764889b48a3af8461a2793d948aa609d6c1..4b6c1c80969182ccf6e0189b18bade8758bbbc30 100644 --- a/python/paddle/v2/plot/tests/CMakeLists.txt +++ b/python/paddle/v2/plot/tests/CMakeLists.txt @@ -1,5 +1,5 @@ if (NOT APPLE) # The Mac OS X backend will not be able to function correctly if Python is # not installed as a framework. - add_python_test(test_ploter test_ploter.py) + py_test(test_ploter SRCS test_ploter.py) endif() diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/v2/reader/tests/CMakeLists.txt index 6a1d337b232c7a849a8793894bf16d26d609d3dd..107d5912e1567e0c8721987a281272c7feb51e63 100644 --- a/python/paddle/v2/reader/tests/CMakeLists.txt +++ b/python/paddle/v2/reader/tests/CMakeLists.txt @@ -1 +1,2 @@ -add_python_test(reader_tests creator_test.py decorator_test.py) +py_test(creator_test SRCS creator_test.py) +py_test(decorator_test SRCS decorator_test.py) diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt index 058f22befd0657d06ff130ace55fe7322148213d..b7791559594321a85f41b508b69efeb077d69595 100644 --- a/python/paddle/v2/tests/CMakeLists.txt +++ b/python/paddle/v2/tests/CMakeLists.txt @@ -1,2 +1,7 @@ -add_python_test(test_v2_api test_data_feeder.py test_op.py test_parameters.py -test_layer.py test_rnn_layer.py test_topology.py test_image.py) +py_test(test_op SRCS test_op.py) +py_test(test_image SRCS test_image.py) +py_test(test_layer SRCS test_layer.py) +py_test(test_topology SRCS test_topology.py) +py_test(test_rnn_layer SRCS test_rnn_layer.py) +py_test(test_parameters SRCS test_parameters.py) +py_test(test_data_feeder SRCS test_data_feeder.py) diff --git a/python/setup.py.in b/python/setup.py.in index 
65a26940d4d703ea4fbb5022523a90716982ec10..38f0a503bee3eb29ae3c893c96d6e333be54b96e 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -1,4 +1,8 @@ -from setuptools import setup +from setuptools import setup, Distribution + +class BinaryDistribution(Distribution): + def has_ext_modules(foo): + return True packages=['paddle', 'paddle.proto', @@ -11,33 +15,44 @@ packages=['paddle', 'paddle.v2.master', 'paddle.v2.plot', 'paddle.v2.framework', - 'paddle.v2.framework.proto'] + 'paddle.v2.framework.proto', + 'py_paddle'] setup_requires=["requests", - "numpy", + "numpy>=1.12", "protobuf==3.1", "recordio", "matplotlib", "rarfile", "scipy>=0.19.0", "Pillow", - "nltk"] + "nltk>=3.2.2"] if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: setup_requires+=["opencv-python"] -setup(name='paddle', +setup(name='paddlepaddle', version='${PADDLE_VERSION}', description='Parallel Distributed Deep Learning', install_requires=setup_requires, packages=packages, - package_data={'paddle.v2.master': ['libpaddle_master.so'], - 'paddle.v2.framework': ['core.so'] + package_data={ + 'paddle.v2.master': ['libpaddle_master.so'], + 'paddle.v2.framework': ['core.so'], + 'py_paddle':['*.py','_swig_paddle.so'] }, package_dir={ '': '${CMAKE_CURRENT_SOURCE_DIR}', # The paddle.v2.framework.proto will be generated while compiling. # So that package points to other directory. - 'paddle.v2.framework.proto': '${PROJ_BINARY_ROOT}/paddle/framework' + 'paddle.v2.framework.proto': '${PROJ_BINARY_ROOT}/paddle/framework', + 'py_paddle': '${PROJ_ROOT}/paddle/py_paddle' }, + scripts=['${PROJ_BINARY_ROOT}/paddle/scripts/paddle'], + distclass=BinaryDistribution, + data_files=[('/usr/local/opt/paddle/bin', + ['${PROJ_BINARY_ROOT}/paddle/scripts/paddle_usage', + '${PROJ_BINARY_ROOT}/paddle/trainer/paddle_trainer', + '${PROJ_BINARY_ROOT}/paddle/trainer/paddle_merge_model', + '${PROJ_BINARY_ROOT}/paddle/pserver/paddle_pserver_main'])] )