diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 980a97a07c996eca2e8c126a6ad5ab7f340fa1e5..bb8c88787d37faf9ce4d7d856a307c11f1085d98 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,10 +17,14 @@ - id: detect-private-key files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - id: end-of-file-fixer -- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git - sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 +- repo: local hooks: - - id: clang-formater + - id: clang-format + name: clang-format + description: Format files with ClangFormat. + entry: clang-format -i + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ - repo: https://github.com/PaddlePaddle/pre-commit-golang sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 hooks: diff --git a/Dockerfile b/Dockerfile index 8cfb16928c95dcbfac08383d32562ff67933d873..156ad3552b2c4ff90b405c35c66d44117c2624a4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,13 +27,16 @@ RUN apt-get update && \ git python-pip python-dev openssh-server bison \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ - python-numpy python-matplotlib gcc g++ \ + python-matplotlib gcc-4.8 g++-4.8 \ automake locales clang-format-3.8 swig doxygen cmake \ liblapack-dev liblapacke-dev libboost-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ net-tools && \ apt-get clean -y +# paddle is using numpy.flip, which is introduced since 1.12.0 +RUN pip --no-cache-dir install 'numpy>=1.12.0' + # Install Go and glide RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ tar -C /usr/local -xzf go.tgz && \ diff --git a/README.md b/README.md index 2a6beeb342b34f8e91ef509d7d41f286a666480c..b9793c3eab5d40c28f01cc67ad607b97261b3235 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ We provide [English](http://doc.paddlepaddle.org/develop/doc/) and - [Deep Learning 101](http://book.paddlepaddle.org/index.html) - You might want to start from the this online interactive book that can run in Jupyter Notebook. + You might want to start from this online interactive book that can run in Jupyter Notebook. - [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html) diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake index 656e1a0803c6e389d70f37f592c3aa2e95a2bcd4..e50530411cc74392091c8026fa012ec7631f7f6b 100644 --- a/cmake/cpplint.cmake +++ b/cmake/cpplint.cmake @@ -56,11 +56,14 @@ macro(add_style_check_target TARGET_NAME) # cpplint code style get_filename_component(base_filename ${filename} NAME) set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint) - add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD + add_custom_command(OUTPUT ${CUR_GEN} PRE_BUILD COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py" "--filter=${STYLE_FILTER}" "--write-success=${CUR_GEN}" ${filename} + DEPENDS ${filename} ${PROJ_ROOT}/paddle/scripts/cpplint.py WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + add_custom_target(${base_filename}.cpplint DEPENDS ${CUR_GEN}) + add_dependencies(${TARGET_NAME} ${base_filename}.cpplint) endif() endforeach() endif() diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index eff15de73f23db6dea3a7b79006bfec90d712ae5..25c6b4ef52d3f8ebff1572ae8d348be7c577c08c 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,34 +20,30 @@ INCLUDE(ExternalProject) SET(MKLDNN_PROJECT "extern_mkldnn") SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) -SET(MKLDNN_INSTALL_ROOT ${CMAKE_INSTALL_PREFIX}) -IF(NOT "$ENV{HOME}" STREQUAL "/root") - SET(MKLDNN_INSTALL_ROOT "$ENV{HOME}") -ENDIF() - -SET(MKLDNN_INSTALL_DIR "${MKLDNN_INSTALL_ROOT}/opt/paddle/third_party/mkldnn") -SET(MKLDNN_INCLUDE_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) +SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) +SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) -IF(WIN32) - MESSAGE(WARNING "It is not supported compiling with mkldnn in windows Paddle yet." - "Force WITH_MKLDNN=OFF") - SET(WITH_MKLDNN OFF) +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with MKLDNN in Paddle yet." + "Force WITH_MKLDNN=OFF") + SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE) return() -ELSE(WIN32) - SET(MKLDNN_LIBRARY "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE) - MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path") - SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) - #SET(CMAKE_MACOSX_RPATH 1) # hold for MacOS - SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") -ENDIF(WIN32) +ENDIF() + +SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE) +MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path") +SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") -INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR}) +INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) IF(${CBLAS_PROVIDER} STREQUAL "MKLML") SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) SET(MKLDNN_MKLROOT ${MKLML_ROOT}) SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB}) SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR}) + MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}") ENDIF() ExternalProject_Add( @@ -57,16 +53,15 @@ ExternalProject_Add( GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" GIT_TAG "v0.9" PREFIX ${MKLDNN_SOURCES_DIR} - CONFIGURE_COMMAND mkdir -p /build - BUILD_COMMAND cd /build - && cmake .. -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} -DMKLROOT=${MKLDNN_MKLROOT} - && $(MAKE) - INSTALL_COMMAND cd /build && $(MAKE) install UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} + CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} + -DMKLROOT:PATH=${MKLDNN_MKLROOT} ) ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIBRARY}) +SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) -MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIBRARY}") +MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}") LIST(APPEND external_project_dependencies mkldnn) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 3f940756a4abb79aba7d3561db19db8532a0b673..17a1ca4ed04dce85ae3c7fdd5f22d6eeed03db59 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -16,19 +16,23 @@ IF(NOT ${WITH_MKLML}) return() ENDIF(NOT ${WITH_MKLML}) +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with MKLML in Paddle yet." + "Force WITH_MKLML=OFF") + SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE) + return() +ENDIF() + INCLUDE(ExternalProject) SET(MKLML_PROJECT "extern_mklml") -SET(MKLML_VER "mklml_lnx_2018.0.20170425") +SET(MKLML_VER "mklml_lnx_2018.0.20170720") SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz") SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") -SET(MKLML_DST_DIR "opt/paddle/third_party/mklml") -SET(MKLML_INSTALL_ROOT "${CMAKE_INSTALL_PREFIX}") -IF(NOT "$ENV{HOME}" STREQUAL "/root") - SET(MKLML_INSTALL_ROOT "$ENV{HOME}") -ENDIF() - +SET(MKLML_DST_DIR "mklml") +SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER}) SET(MKLML_INC_DIR ${MKLML_ROOT}/include) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 67a359d4b5f4cca8fc8e74eab4d4acb4cc12baed..490c87d67ed79a238dd506127cd4d9855fab6626 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -24,7 +24,6 @@ IF(WITH_PYTHON) ENDIF(WITH_PYTHON) SET(py_env "") -SET(USE_VIRTUALENV_FOR_TEST 1) IF(PYTHONINTERP_FOUND) find_python_module(pip REQUIRED) find_python_module(numpy REQUIRED) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index ef31c252038ce18655913c0f41343fe6dc7dbb86..d00a9bb3a30cfb16623e073414088059481c3e1a 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -9,6 +9,11 @@ function(CheckCompilerCXX11Flag) if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") endif() + # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem. + # Use Debug mode instead for now. + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE) + endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" # Apple Clang is a different compiler than upstream Clang which havs different version numbers. diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 534be0abe246ac70950d85ad05441825c8ca768a..41b9b5928958ae31799c396a8d77fd7cff557905 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -187,7 +187,13 @@ function(cc_library TARGET_NAME) endif() # cpplint code style - add_style_check_target(${TARGET_NAME} ${cc_library_SRCS}) + foreach(source_file ${cc_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS}) else(cc_library_SRCS) if (cc_library_DEPS) @@ -239,6 +245,14 @@ function(nv_library TARGET_NAME) add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) target_link_libraries(${TARGET_NAME} ${nv_library_DEPS}) endif() + # cpplint code style + foreach(source_file ${nv_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS}) else(nv_library_SRCS) if (nv_library_DEPS) merge_static_libs(${TARGET_NAME} ${nv_library_DEPS}) diff --git a/cmake/util.cmake b/cmake/util.cmake index 87ad9d91d8701c56255c1e7f224764998df634a7..4a27623b7ffc0b389680baee52db440c78442f46 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -118,7 +118,6 @@ endfunction() macro(add_unittest_without_exec TARGET_NAME) add_executable(${TARGET_NAME} ${ARGN}) link_paddle_test(${TARGET_NAME}) - add_style_check_target(${TARGET_NAME} ${ARGN}) endmacro() # add_unittest @@ -150,9 +149,12 @@ endfunction() # Create a python unittest using run_python_tests.sh, # which takes care of making correct running environment function(add_python_test TEST_NAME) - add_test(NAME ${TEST_NAME} - COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR} - bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh - ${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + foreach(arg ${ARGN}) + get_filename_component(py_fn ${arg} NAME_WE) + set(TRG_NAME ${TEST_NAME}_${py_fn}) + add_test(NAME ${TRG_NAME} + COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR} + python2 ${arg} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endforeach() endfunction() diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index daee55b7f9adfffdf709ed2b5b0d957c7ca1aea4..372272a53c12c314fc80eebbce5eae9fcabc55ba 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -104,6 +104,11 @@ cross_channel_norm ------------------ .. autoclass:: paddle.v2.layer.cross_channel_norm :noindex: + +row_l2_norm +----------- +.. autoclass:: paddle.v2.layer.row_l2_norm + :noindex: Recurrent Layers ================ @@ -198,6 +203,10 @@ identity_projection .. autoclass:: paddle.v2.layer.identity_projection :noindex: +slice_projection +------------------- +.. autoclass:: paddle.v2.layer.slice_projection + :noindex: table_projection ---------------- @@ -316,6 +325,11 @@ scaling .. autoclass:: paddle.v2.layer.scaling :noindex: +clip +---- +.. autoclass:: paddle.v2.layer.clip + :noindex: + slope_intercept --------------- .. autoclass:: paddle.v2.layer.slope_intercept diff --git a/doc/design/scope.md b/doc/design/scope.md index afe6bc028cafc5ee24b0041905857af58d3f5790..c9e0be716b606f6c7bf0373e0c6e632647e07a6f 100644 --- a/doc/design/scope.md +++ b/doc/design/scope.md @@ -37,8 +37,8 @@ Scope is an association of a name to variable. All variables belong to `Scope`. ```cpp class Scope { public: - Variable* CreateVariable(const std::string& name); - const Variable* GetVariable(const std::string& name) const; + Variable* NewVar(const std::string& name); + const Variable* FindVar(const std::string& name) const; private: std::unordered_map> vars_; @@ -58,12 +58,12 @@ class Scope { public: Scope(const std::shared_ptr& scope): parent_(scope) {} - Variable* GetVariable(const std::string& name) const { + Variable* FindVar(const std::string& name) const { auto it = vars_.find(name); if (it != vars_.end()) { return it->second.get(); } else if (parent_ != nullptr) { - return parent_->GetVariable(name); + return parent_->FindVar(name); } else { return nullptr; } @@ -95,10 +95,10 @@ class Scope { static std::shared_ptr Create(const std::shared_ptr& parent = nullptr); // return nullptr if not found. - Variable* GetVariable(const std::string& name) const; + Variable* FindVar(const std::string& name) const; // return if already contains same name variable. - Variable* CreateVariable(const std::string& name); + Variable* NewVar(const std::string& name); private: std::shared_ptr parent_; @@ -107,11 +107,11 @@ class Scope { ``` ## Only scope can create a variable -To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `CreateVariable` can construct `Variable`. +To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `NewVar` can construct `Variable`. ## When scope destroyed, all variables inside this scope should be destroyed together -The scope hold unique pointers for all variables. User can `GetVariable` from scope, but he should not hold this pointer as a member variable. Because when scope is destroyed, all variables inside this scope will be destroyed together. +The scope hold unique pointers for all variables. User can `FindVar` from scope, but he should not hold this pointer as a member variable. Because when scope is destroyed, all variables inside this scope will be destroyed together. ## Sharing a parent scope @@ -121,4 +121,4 @@ Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shar ## Orthogonal interface -`GetVariable` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `CreateVariable` will return a `Error` when there is a name conflict locally. Combine `GetVariable` and `CreateVariable`, we can implement `CreateOrGetVariable` easily. +`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `NewVar` will return a `Error` when there is a name conflict locally. Combine `FindVar` and `NewVar`, we can implement `NewVar` easily. diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py index 85cb399590f7a5e7e73285ca87c49ea5f24afb32..572a61e4ccaa9ef3d03a60d916e80eab907c6d88 100644 --- a/go/pserver/client/c/test/test_train.py +++ b/go/pserver/client/c/test/test_train.py @@ -3,24 +3,11 @@ import paddle.v2.dataset.uci_housing as uci_housing import paddle.v2.master as master import os import cPickle as pickle +from paddle.v2.reader.creator import cloud_reader etcd_ip = os.getenv("MASTER_IP", "127.0.0.1") -etcd_endpoint = "http://" + etcd_ip + ":2379" -print "connecting to master, etcd endpoints: ", etcd_endpoint -master_client = master.client(etcd_endpoint, 5, 64) - - -def cloud_reader(): - global master_client - master_client.set_dataset( - ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*"], passes=30) - while 1: - r, e = master_client.next_record() - if not r: - if e != -2: # other errors - print "get record error:", e - break - yield pickle.loads(r) +etcd_endpoints = "http://" + etcd_ip + ":2379" +print "etcd endpoints: ", etcd_endpoints def main(): @@ -49,7 +36,7 @@ def main(): parameters=parameters, update_equation=optimizer, is_local=False, - pserver_spec=etcd_endpoint, + pserver_spec=etcd_endpoints, use_etcd=True) # event_handler to print training and testing info @@ -75,7 +62,11 @@ def main(): trainer.train( reader=paddle.batch( paddle.reader.shuffle( - cloud_reader, buf_size=500), batch_size=2), + cloud_reader( + ["/pfs/dlnel/public/dataset/uci_housing/uci_housing*"], + etcd_endpoints), + buf_size=500), + batch_size=2), feeding={'x': 0, 'y': 1}, event_handler=event_handler, diff --git a/paddle/.set_python_path.sh b/paddle/.set_python_path.sh index fa7baccc86e0b56e57d52a40c95cfe1b98fececc..8fd58925ee4820269572176ff9496f42914652da 100755 --- a/paddle/.set_python_path.sh +++ b/paddle/.set_python_path.sh @@ -21,22 +21,15 @@ # # It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...} # - -if ! python -c "import paddle" >/dev/null 2>/dev/null; then - PYPATH="" - set -x - while getopts "d:" opt; do - case $opt in - d) - PYPATH=$OPTARG - ;; - esac - done - shift $(($OPTIND - 1)) - export PYTHONPATH=$PYPATH:$PYTHONPATH - $@ -else - echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment." - echo "Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'" - exit 1 -fi +PYPATH="" +set -x +while getopts "d:" opt; do + case $opt in + d) + PYPATH=$OPTARG + ;; + esac +done +shift $(($OPTIND - 1)) +export PYTHONPATH=$PYPATH:$PYTHONPATH +$@ diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 4b06966fba2bc9f92756be0cb8110bbcd5272423..f8a88cf317aee6c5dd25e4cc25d588c6c50fcbce 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -15,7 +15,6 @@ if(Boost_FOUND) add_subdirectory(platform) add_subdirectory(framework) add_subdirectory(operators) - add_subdirectory(pybind) endif() if(WITH_C_API) diff --git a/paddle/cuda/src/hl_batch_transpose.cu b/paddle/cuda/src/hl_batch_transpose.cu index f047403da17e66960f029f2fee7312210009c952..f4c253df7b4be937f041f18587efd4c9d693fbe4 100644 --- a/paddle/cuda/src/hl_batch_transpose.cu +++ b/paddle/cuda/src/hl_batch_transpose.cu @@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_batch_transpose.h" #include "hl_base.h" +#include "hl_batch_transpose.h" const int TILE_DIM = 64; const int BLOCK_ROWS = 16; // No bank-conflict transpose for a batch of data. -__global__ void batchTransposeNoBankConflicts(real* odata, - const real* idata, - int numSamples, int width, - int height) { +__global__ void batchTransposeNoBankConflicts( + real* odata, const real* idata, int numSamples, int width, int height) { __shared__ float tile[TILE_DIM][TILE_DIM + 1]; const int x = blockIdx.x * TILE_DIM + threadIdx.x; @@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata, newX] = tile[threadIdx.x][j]; } -void batchTranspose(const real* input, real* output, int width, int height, - int batchSize) { +void batchTranspose( + const real* input, real* output, int width, int height, int batchSize) { dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize); - batchTransposeNoBankConflicts<<>> - (output, input, batchSize, width, height); + batchTransposeNoBankConflicts<<>>( + output, input, batchSize, width, height); CHECK_SYNC("batchTranspose failed!"); } diff --git a/paddle/cuda/src/hl_cuda_aggregate.cu b/paddle/cuda/src/hl_cuda_aggregate.cu index 97034a917708487d1c5dc59e6ebbf45bad1c3227..16a54ad343fa140aa1f3bec311c4b712d0086082 100644 --- a/paddle/cuda/src/hl_cuda_aggregate.cu +++ b/paddle/cuda/src/hl_cuda_aggregate.cu @@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - +#include "hl_aggregate.h" #include "hl_base.h" #include "hl_cuda.h" #include "hl_cuda.ph" -#include "hl_aggregate.h" -#include "hl_thread.ph" #include "hl_matrix_base.cuh" +#include "hl_thread.ph" #include "paddle/utils/Logging.h" /** * @brief matrix row operator. */ -template -__global__ void KeMatrixRowOp(Agg agg, - real *E, - real *Sum, - int dimN) { +template +__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) { __shared__ real sum_s[blockSize]; - int cnt = (dimN + blockSize -1) / blockSize; - int rowId = blockIdx.x + blockIdx.y*gridDim.x; - int index = rowId*dimN; + int cnt = (dimN + blockSize - 1) / blockSize; + int rowId = blockIdx.x + blockIdx.y * gridDim.x; + int index = rowId * dimN; int tid = threadIdx.x; int lmt = tid; @@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg, sum_s[tid] = tmp; __syncthreads(); - for (int stride = blockSize/2; stride > 0; stride = stride/2) { + for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { if (tid < stride) { sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]); } @@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg, } template -void hl_matrix_row_op(Agg agg, - real *A_d, - real *C_d, - int dimM, - int dimN) { +void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) { int blocksX = dimM; int blocksY = 1; dim3 threads(128, 1); dim3 grid(blocksX, blocksY); - KeMatrixRowOp<<< grid, threads, 0, STREAM_DEFAULT >>> - (agg, A_d, C_d, dimN); + KeMatrixRowOp<<>>( + agg, A_d, C_d, dimN); } void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_row_op(aggregate::sum(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_row_sum failed"); } @@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_row_op(aggregate::max(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_row_max failed"); } @@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_row_op(aggregate::min(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_row_min failed"); } /** * @brief matrix column operator. */ -template -__global__ void KeMatrixColumnOp(Agg agg, - real *E, - real *Sum, - int dimM, - int dimN) { +template +__global__ void KeMatrixColumnOp( + Agg agg, real *E, real *Sum, int dimM, int dimN) { int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; real tmp = agg.init(); if (rowIdx < dimN) { @@ -127,15 +104,12 @@ __global__ void KeMatrixColumnOp(Agg agg, } } -template -__global__ void KeMatrixColumnOp_S(Agg agg, - real *E, - real *Sum, - int dimM, - int dimN) { - __shared__ real _sum[blockDimX*blockDimY]; - int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; - int index = threadIdx.y; +template +__global__ void KeMatrixColumnOp_S( + Agg agg, real *E, real *Sum, int dimM, int dimN) { + __shared__ real _sum[blockDimX * blockDimY]; + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + int index = threadIdx.y; real tmp = agg.init(); if (rowIdx < dimN) { @@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg, index += blockDimY; } } - _sum[threadIdx.x + threadIdx.y*blockDimX] = tmp; + _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp; __syncthreads(); if (rowIdx < dimN) { - if (threadIdx.y ==0) { + if (threadIdx.y == 0) { real tmp = agg.init(); - for (int i=0; i < blockDimY; i++) { - tmp = agg(tmp, _sum[threadIdx.x + i*blockDimX]); + for (int i = 0; i < blockDimY; i++) { + tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]); } Sum[rowIdx] = tmp; } @@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg, } template -void hl_matrix_column_op(Agg agg, - real *A_d, - real *C_d, - int dimM, - int dimN) { +void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) { if (dimN >= 8192) { - int blocksX = (dimN + 128 -1) / 128; + int blocksX = (dimN + 128 - 1) / 128; int blocksY = 1; dim3 threads(128, 1); dim3 grid(blocksX, blocksY); - KeMatrixColumnOp<<< grid, threads, 0, STREAM_DEFAULT >>> - (agg, A_d, C_d, dimM, dimN); + KeMatrixColumnOp<<>>( + agg, A_d, C_d, dimM, dimN); } else { - int blocksX = (dimN + 32 -1) / 32; + int blocksX = (dimN + 32 - 1) / 32; int blocksY = 1; dim3 threads(32, 32); dim3 grid(blocksX, blocksY); - KeMatrixColumnOp_S<<< grid, threads, 0, STREAM_DEFAULT>>> - (agg, A_d, C_d, dimM, dimN); + KeMatrixColumnOp_S<<>>( + agg, A_d, C_d, dimM, dimN); } return; @@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_column_op(aggregate::sum(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_column_sum failed"); } @@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_column_op(aggregate::max(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_column_max failed"); } @@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_column_op(aggregate::min(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_column_min failed"); } @@ -226,16 +184,16 @@ template __global__ void KeVectorSum(real *E, real *Sum, int dimM) { __shared__ double sum_s[blockSize]; int tid = threadIdx.x; - int index = blockIdx.y*blockDim.x+threadIdx.x; + int index = blockIdx.y * blockDim.x + threadIdx.x; sum_s[tid] = 0.0f; while (index < dimM) { sum_s[tid] += E[index]; - index += blockDim.x*gridDim.y; + index += blockDim.x * gridDim.y; } __syncthreads(); - for (int stride = blockSize/2; stride > 0; stride = stride/2) { + for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { if (tid < stride) { sum_s[tid] += sum_s[tid + stride]; } @@ -259,38 +217,39 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) { dim3 threads(blockSize, 1); dim3 grid(blocksX, blocksY); - struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; + struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; hl_event_t hl_event = &hl_event_st; - while (!hl_cuda_event_is_ready(hl_event)) {} + while (!hl_cuda_event_is_ready(hl_event)) { + } - KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> - (A_d, t_resource.gpu_mem, dimM); - KeVectorSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> - (t_resource.gpu_mem, t_resource.cpu_mem, 128); + KeVectorSum<128><<>>( + A_d, t_resource.gpu_mem, dimM); + KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>( + t_resource.gpu_mem, t_resource.cpu_mem, 128); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_synchronize(HPPL_STREAM_DEFAULT); cudaError_t err = (cudaError_t)hl_get_device_last_error(); - CHECK_EQ(cudaSuccess, err) - << "CUDA error: " << hl_get_device_error_string((size_t)err); + CHECK_EQ(cudaSuccess, err) << "CUDA error: " + << hl_get_device_error_string((size_t)err); } template __global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) { __shared__ double sum_s[blockSize]; int tid = threadIdx.x; - int index = blockIdx.y*blockDim.x+threadIdx.x; + int index = blockIdx.y * blockDim.x + threadIdx.x; sum_s[tid] = 0.0f; while (index < dimM) { sum_s[tid] += abs(E[index]); - index += blockDim.x*gridDim.y; + index += blockDim.x * gridDim.y; } __syncthreads(); - for (int stride = blockSize/2; stride > 0; stride = stride/2) { + for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { if (tid < stride) { sum_s[tid] += sum_s[tid + stride]; } @@ -314,20 +273,21 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) { dim3 threads(blockSize, 1); dim3 grid(blocksX, blocksY); - struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; + struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; hl_event_t hl_event = &hl_event_st; - while (!hl_cuda_event_is_ready(hl_event)) {} + while (!hl_cuda_event_is_ready(hl_event)) { + } - KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> - (A_d, t_resource.gpu_mem, dimM); - KeVectorAbsSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> - (t_resource.gpu_mem, t_resource.cpu_mem, 128); + KeVectorAbsSum<128><<>>( + A_d, t_resource.gpu_mem, dimM); + KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>( + t_resource.gpu_mem, t_resource.cpu_mem, 128); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_synchronize(HPPL_STREAM_DEFAULT); cudaError_t err = (cudaError_t)hl_get_device_last_error(); - CHECK_EQ(cudaSuccess, err) - << "CUDA error: " << hl_get_device_error_string((size_t)err); + CHECK_EQ(cudaSuccess, err) << "CUDA error: " + << hl_get_device_error_string((size_t)err); } diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index b6e3e63a4f52261e49467bd82fdabd063e81460e..aac19b1ea566ad69f1f7374e393676c8debd9883 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -12,21 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "hl_base.h" #include "hl_cnn.h" #include "hl_device_functions.cuh" -__global__ void KeMaxPoolForward(const int nthreads, const real* inputData, - const int channels, const int height, +__global__ void KeMaxPoolForward(const int nthreads, + const real* inputData, + const int channels, + const int height, const int width, - const int pooledH, const int pooledW, - const int ksizeW, const int ksizeH, - const int strideH, const int strideW, - const int offsetH, const int offsetW, - real* tgtData, const int tgtStride) { - int index = blockIdx.x * blockDim.x + threadIdx.x; + const int pooledH, + const int pooledW, + const int ksizeW, + const int ksizeH, + const int strideH, + const int strideW, + const int offsetH, + const int offsetW, + real* tgtData, + const int tgtStride) { + int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; int ph = (index / pooledW) % pooledH; @@ -46,44 +52,70 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData, maxval = inputData[h * width + w]; } } - int tgtIndex = index % (pooledW * pooledH * channels) + - frameNum * tgtStride; + int tgtIndex = + index % (pooledW * pooledH * channels) + frameNum * tgtStride; tgtData[tgtIndex] = maxval; } } -void hl_maxpool_forward(const int frameCnt, const real* inputData, +void hl_maxpool_forward(const int frameCnt, + const real* inputData, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real* tgtData, const int tgtStride) { - + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) { int num_kernels = pooledH * pooledW * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); - KeMaxPoolForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (num_kernels, inputData, channels, height, width, - pooledH, pooledW, sizeX, sizeY, strideH, strideW, - paddingH, paddingW, tgtData, tgtStride); + KeMaxPoolForward<<>>(num_kernels, + inputData, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + tgtData, + tgtStride); CHECK_SYNC("hl_maxpool_forward failed"); } -__global__ void KeMaxPoolBackward(const int nthreads, const real* inputData, - const real* outData, const real* outGrad, - const int channels, const int height, +__global__ void KeMaxPoolBackward(const int nthreads, + const real* inputData, + const real* outData, + const real* outGrad, + const int channels, + const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int padH, const int padW, - real scaleA, real scaleB, - real* targetGrad, const int outStride) { - int index = blockIdx.x * blockDim.x + threadIdx.x; + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int padH, + const int padW, + real scaleA, + real scaleB, + real* targetGrad, + const int outStride) { + int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { // find out the local index // find out the local offset @@ -107,43 +139,69 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData, } } } - targetGrad[index] = - scaleB * targetGrad[index] + scaleA * gradient; + targetGrad[index] = scaleB * targetGrad[index] + scaleA * gradient; } } -void hl_maxpool_backward(const int frameCnt, const real* inputData, - const real* outData, const real* outGrad, - const int channels, const int height, - const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real scaleA, real scaleB, - real* targetGrad, const int outStride) { - +void hl_maxpool_backward(const int frameCnt, + const real* inputData, + const real* outData, + const real* outGrad, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real scaleA, + real scaleB, + real* targetGrad, + const int outStride) { int num_kernels = height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; - KeMaxPoolBackward<<< blocks, 1024, 0, STREAM_DEFAULT >>> - (num_kernels, inputData, outData, outGrad, channels, - height, width, pooledH, pooledW, sizeX, sizeY, - strideH, strideW, - paddingH, paddingW, - scaleA, scaleB, - targetGrad, outStride); + KeMaxPoolBackward<<>>(num_kernels, + inputData, + outData, + outGrad, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleA, + scaleB, + targetGrad, + outStride); CHECK_SYNC("hl_maxpool_backward"); } -__global__ void KeAvgPoolForward(const int nthreads, const real* inputData, +__global__ void KeAvgPoolForward(const int nthreads, + const real* inputData, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int padH, const int padW, - real* tgtData, const int tgtStride) { + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int padH, + const int padW, + real* tgtData, + const int tgtStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; @@ -168,39 +226,64 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData, aveval += inputData[h * width + w]; } } - int tgtIndex = index % (pooledW * pooledH * channels) + - frameNum * tgtStride; + int tgtIndex = + index % (pooledW * pooledH * channels) + frameNum * tgtStride; tgtData[tgtIndex] = aveval / pool_size; } } -void hl_avgpool_forward(const int frameCnt, const real* inputData, +void hl_avgpool_forward(const int frameCnt, + const real* inputData, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real* tgtData, const int tgtStride) { + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) { int num_kernels = pooledH * pooledW * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; - KeAvgPoolForward<<< blocks, 1024, 0, STREAM_DEFAULT >>> - (num_kernels, inputData, channels, - height, width, pooledH, pooledW, - sizeX, sizeY, strideH, strideW, - paddingH, paddingW, tgtData, tgtStride); + KeAvgPoolForward<<>>(num_kernels, + inputData, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + tgtData, + tgtStride); CHECK_SYNC("hl_avgpool_forward failed"); } -__global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, - const int channels, const int height, +__global__ void KeAvgPoolBackward(const int nthreads, + const real* outGrad, + const int channels, + const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int padH, const int padW, - real scaleA, real scaleB, - real* tgtGrad, const int outStride) { + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int padH, + const int padW, + real scaleA, + real scaleB, + real* tgtGrad, + const int outStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int offsetW = index % width + padW; @@ -215,7 +298,6 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, real gradient = 0; outGrad += (frameNum * outStride + offsetC * pooledH * pooledW); - for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { // figure out the pooling size @@ -224,32 +306,50 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, int hend = min(hstart + sizeY, height + padH); int wend = min(wstart + sizeX, width + padW); int poolsize = (hend - hstart) * (wend - wstart); - gradient += outGrad[ph * pooledW + pw]/poolsize; + gradient += outGrad[ph * pooledW + pw] / poolsize; } } tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient; } } -void hl_avgpool_backward(const int frameCnt, const real* outGrad, +void hl_avgpool_backward(const int frameCnt, + const real* outGrad, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real scaleA, real scaleB, - real* backGrad, const int outStride) { + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real scaleA, + real scaleB, + real* backGrad, + const int outStride) { int num_kernels = height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; - KeAvgPoolBackward <<< blocks, 1024, 0, STREAM_DEFAULT >>> - (num_kernels, outGrad, channels, height, width, - pooledH, pooledW, sizeX, sizeY, - strideH, strideW, - paddingH, paddingW, - scaleA, scaleB, - backGrad, outStride); + KeAvgPoolBackward<<>>(num_kernels, + outGrad, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleA, + scaleB, + backGrad, + outStride); CHECK_SYNC("hl_avgpool_backward failed"); } @@ -266,7 +366,7 @@ __global__ void KeBilinearInterpFw(const real* in, const size_t numChannels, const real ratioH, const real ratioW) { - int nthreads = outputH * outputW; + int nthreads = outputH * outputW; int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < nthreads) { int outIdH = tid / outputW; @@ -287,13 +387,14 @@ __global__ void KeBilinearInterpFw(const real* in, real w1lambda = ratioW * outImgIdx - inImgIdx; real w2lambda = 1.f - w1lambda; - const real* inPos = - &in[outIdH * inputW + channelId * inImgSize + inImgIdy * inImgW + inImgIdx]; + const real* inPos = &in[outIdH * inputW + channelId * inImgSize + + inImgIdy * inImgW + inImgIdx]; // bilinear interpolation out[outIdH * outputW + outIdW] = - h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) + - h1lambda * (w2lambda * inPos[hId * inImgW] + w1lambda * inPos[hId * inImgW + wId]); + h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) + + h1lambda * (w2lambda * inPos[hId * inImgW] + + w1lambda * inPos[hId * inImgW + wId]); } } @@ -313,9 +414,19 @@ void hl_bilinear_forward(const real* inData, int threadNum = outputH * outputW; int blocks = (threadNum + 1024 - 1) / 1024; - KeBilinearInterpFw<<< blocks, 1024, 0, STREAM_DEFAULT>>>( - inData, inImgH, inImgW, inputH, inputW, outData, outImgH, - outImgW, outputH, outputW, numChannels, ratioH, ratioW); + KeBilinearInterpFw<<>>(inData, + inImgH, + inImgW, + inputH, + inputW, + outData, + outImgH, + outImgW, + outputH, + outputW, + numChannels, + ratioH, + ratioW); CHECK_SYNC("hl_bilinear_forward failed"); } @@ -353,13 +464,15 @@ __global__ void KeBilinearInterpBw(real* in, real w1lambda = ratioW * outImgIdx - inImgIdx; real w2lambda = 1.f - w1lambda; - real* inPos = - &in[outIdH * inputW + channelId * inImgSize + inImgIdy * inImgW + inImgIdx]; + real* inPos = &in[outIdH * inputW + channelId * inImgSize + + inImgIdy * inImgW + inImgIdx]; const real* outPos = &out[outIdH * outputW + outIdW]; paddle::paddleAtomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]); paddle::paddleAtomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]); - paddle::paddleAtomicAdd(&inPos[hId * inImgW], h1lambda * w2lambda * outPos[0]); - paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId], h1lambda * w1lambda * outPos[0]); + paddle::paddleAtomicAdd(&inPos[hId * inImgW], + h1lambda * w2lambda * outPos[0]); + paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId], + h1lambda * w1lambda * outPos[0]); } } @@ -379,22 +492,37 @@ void hl_bilinear_backward(real* inGrad, int threadNum = outputH * outputW; int blocks = (threadNum + 1024 - 1) / 1024; - KeBilinearInterpBw<<< blocks, 1024, 0, STREAM_DEFAULT>>>( - inGrad, inImgH, inImgW, inputH, inputW, outGrad, outImgH, - outImgW, outputH, outputW, numChannels, ratioH, ratioW); + KeBilinearInterpBw<<>>(inGrad, + inImgH, + inImgW, + inputH, + inputW, + outGrad, + outImgH, + outImgW, + outputH, + outputW, + numChannels, + ratioH, + ratioW); CHECK_SYNC("hl_bilinear_backward failed"); } -__global__ void maxoutFpCompute(size_t nthreads, const real * inData, - real * outData, int* idData, - size_t size, size_t featLen, size_t groups) { +__global__ void maxoutFpCompute(size_t nthreads, + const real* inData, + real* outData, + int* idData, + size_t size, + size_t featLen, + size_t groups) { int index = blockIdx.x * blockDim.x + threadIdx.x; - if(index < nthreads) { + if (index < nthreads) { size_t batch_idx = index / size; size_t i = index % size; size_t channel_idx = i / featLen; size_t feat_idx = i % featLen; - size_t data_idx = (batch_idx * size + channel_idx * featLen) * groups + feat_idx; + size_t data_idx = + (batch_idx * size + channel_idx * featLen) * groups + feat_idx; real max = inData[data_idx]; int maxId = 0; for (size_t g = 1; g < groups; ++g) { @@ -409,37 +537,50 @@ __global__ void maxoutFpCompute(size_t nthreads, const real * inData, } } -void hl_maxout_forward(const real* inData, real* outData, - int* idData, size_t batchSize, size_t size, - size_t featLen, size_t groups) { +void hl_maxout_forward(const real* inData, + real* outData, + int* idData, + size_t batchSize, + size_t size, + size_t featLen, + size_t groups) { int num_kernels = size * batchSize; int blocks = (num_kernels + 1024 - 1) / 1024; - maxoutFpCompute<<< blocks, 1024, 0, STREAM_DEFAULT>>>( - num_kernels, inData, outData, idData, size, featLen, groups); + maxoutFpCompute<<>>( + num_kernels, inData, outData, idData, size, featLen, groups); CHECK_SYNC("hl_maxout_forward failed"); } -__global__ void maxoutBpCompute(size_t nthreads, real* inGrad, - const real* outGrad, const int* idData, - size_t size, size_t featLen, size_t groups) { +__global__ void maxoutBpCompute(size_t nthreads, + real* inGrad, + const real* outGrad, + const int* idData, + size_t size, + size_t featLen, + size_t groups) { int index = blockIdx.x * blockDim.x + threadIdx.x; - if(index < nthreads) { + if (index < nthreads) { size_t batch_idx = index / size; size_t i = index % size; size_t channel_idx = i / featLen; size_t feat_idx = i % featLen; size_t newIndex = batch_idx * size; - size_t gradIdx = (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx; + size_t gradIdx = + (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx; (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i]; } } -void hl_maxout_backward(real* inGrad, const real* outGrad, - const int* idData, size_t batchSize, size_t size, - size_t featLen, size_t groups) { +void hl_maxout_backward(real* inGrad, + const real* outGrad, + const int* idData, + size_t batchSize, + size_t size, + size_t featLen, + size_t groups) { int num_kernels = size * batchSize; int blocks = (num_kernels + 1024 - 1) / 1024; - maxoutBpCompute<<< blocks, 1024, 0, STREAM_DEFAULT >>>( - num_kernels, inGrad, outGrad, idData, size, featLen, groups); + maxoutBpCompute<<>>( + num_kernels, inGrad, outGrad, idData, size, featLen, groups); CHECK_SYNC("hl_maxout_backward failed"); } diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index c53a5636829cab9d575f58cc2326cb3efe383e1c..7ad8a39768a064140a08c912a5a467bc24a12adf 100644 --- a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -1022,6 +1022,15 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, real alpha = 1.0f; real beta = 1.0f; cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; + + int batch_size = ((cudnn_tensor_descriptor)inputDesc)->batch_size; + if (batch_size > 1024 && g_cudnn_lib_version < 6000) { + LOG(INFO) << " To process current batch data with size " << batch_size + << " (>1024), cudnnBatchNorm requires cuDNN version >= 6000." + << " If there is an error complaining CUDNN_STATUS_NOT_SUPPORTED," + << " just recompile PaddlePaddle with cuDNN >= 6000, replacing" + << " current version " << g_cudnn_lib_version; + } CHECK_CUDNN( dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle, mode, diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu index b869d903ba3cfb188f823518ba8ee7d17f9b2440..a5ce81a904ebbd655a16ef68660b81d442478575 100644 --- a/paddle/cuda/src/hl_cuda_lstm.cu +++ b/paddle/cuda/src/hl_cuda_lstm.cu @@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - +#include "hl_activation_functions.h" #include "hl_base.h" #include "hl_cuda_cublas.h" #include "hl_device_functions.cuh" -#include "hl_activation_functions.h" #include "paddle/utils/Logging.h" -typedef hppl::Active::forward t_forward; +typedef hppl::Active::forward t_forward; typedef hppl::Active::backward t_backward; bool hl_lstm_sequence_parallel(int frameSize) { @@ -42,9 +41,9 @@ public: value_ += (start + length - 1) * frameSize + idx; } } - __device__ inline real *getPtr() const {return value_;} - __device__ inline real getValue() {return *value_;} - __device__ inline void setValue(real value) {*value_ = value;} + __device__ inline real *getPtr() const { return value_; } + __device__ inline real getValue() { return *value_; } + __device__ inline void setValue(real value) { *value_ = value; } template __device__ inline void nextFrame() { if (reversed == 0) { @@ -55,28 +54,25 @@ public: } }; -__device__ __forceinline__ -void ptx_sync(const int id, const int barriers) { +__device__ __forceinline__ void ptx_sync(const int id, const int barriers) { asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(barriers) : "memory"); } -__device__ __forceinline__ -void ptx_arrive(const int id, const int barriers) { +__device__ __forceinline__ void ptx_arrive(const int id, const int barriers) { asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(barriers) : "memory"); } -template -__device__ __forceinline__ real -forward_sequence(real value, - real *shValue, - real *state, - real *preOutput, - real *output, - real check, - int index, - t_forward activeNode, - t_forward activeGate, - t_forward activeState) { +template +__device__ __forceinline__ real forward_sequence(real value, + real *shValue, + real *state, + real *preOutput, + real *output, + real check, + int index, + t_forward activeNode, + t_forward activeGate, + t_forward activeState) { real out; real prevOut; real state_r; @@ -112,17 +108,20 @@ forward_sequence(real value, if (idy == 0) { ptx_sync(2, frameSize * 2); prevOut = state[idx]; - prevOut = activeState(prevOut); + prevOut = activeState(prevOut); preOutput[idx] = prevOut; ptx_arrive(3, frameSize * 2); } return value; } -#define OUTPUT_BARRIER_ID 10 -#define OUTPUT_BARRIER_ID2 11 -template +#define OUTPUT_BARRIER_ID 10 +#define OUTPUT_BARRIER_ID2 11 +template __global__ void KeLstmForward(real *gateValue, real *state, real *output, @@ -184,10 +183,16 @@ __global__ void KeLstmForward(real *gateValue, } } value = forward_sequence( - value, shValue, shState, shPrevOutput, shOutput, check, index, - hppl::gpu::forward[active_node], - hppl::gpu::forward[active_gate], - hppl::gpu::forward[active_state]); + value, + shValue, + shState, + shPrevOutput, + shOutput, + check, + index, + hppl::gpu::forward[active_node], + hppl::gpu::forward[active_gate], + hppl::gpu::forward[active_state]); const int idx = index % frameSize; const int idy = index / frameSize; if (valueSize == 128) { @@ -218,7 +223,7 @@ __global__ void KeLstmForward(real *gateValue, real B_r[frameSize]; const int computeIdx = index - valueSize; if (i == 0) { - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { B_r[n] = weight[n * valueSize + computeIdx]; } @@ -230,7 +235,7 @@ __global__ void KeLstmForward(real *gateValue, } real sum = 0.0f; for (int n = 0; n < frameSize; n++) { - sum += A_r[n]*B_r[n]; + sum += A_r[n] * B_r[n]; } shValue[computeIdx] = sum; ptx_arrive(OUTPUT_BARRIER_ID2, blockSize); @@ -239,14 +244,14 @@ __global__ void KeLstmForward(real *gateValue, if (valueSize == 256) { real B_r[frameSize]; if (i == 0) { - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { B_r[n] = weight[n * valueSize + index]; } } real sum = 0.0f; for (int n = 0; n < frameSize; n++) { - sum += shOutput[n]*B_r[n]; + sum += shOutput[n] * B_r[n]; } value += sum; } @@ -273,50 +278,81 @@ void hl_lstm_parallel_forward(real *gateValue, dim3 grid(numSequences, 1); if (!reversed) { if (frameSize == 32) { - KeLstmForward<128, 32, 0, 128, 256> - <<>> - (gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<128, 32, 0, 128, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmForward<256, 64, 0, 256, 256> - <<>> - (gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<256, 64, 0, 256, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } } else { if (frameSize == 32) { - KeLstmForward<128, 32, 1, 128, 256> - <<>> - (gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<128, 32, 1, 128, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmForward<256, 64, 1, 256, 256> - <<>> - (gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<256, 64, 1, 256, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } } CHECK_SYNC("hl_lstm_parallel_forward failed"); } -__device__ __forceinline__ -void transpose_32x32(real a[], const int idx) { +__device__ __forceinline__ void transpose_32x32(real a[], const int idx) { int addr = idx % 32; - #pragma unroll +#pragma unroll for (int k = 1; k < 32; k++) { // rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32); addr = __shfl(addr, (idx + 1) % 32, 32); a[k] = __shfl(a[k], addr, 32); } - #pragma unroll +#pragma unroll for (int tid = 0; tid < 31; tid++) { real tmp = (idx > tid) ? a[0] : a[1]; - #pragma unroll +#pragma unroll for (int k = 31; k > 0; k--) { a[(k + 1) % 32] = (idx > tid) ? a[k] : a[(k + 1) % 32]; } @@ -324,29 +360,28 @@ void transpose_32x32(real a[], const int idx) { } addr = (32 - idx) % 32; - #pragma unroll +#pragma unroll for (int k = 0; k < 32; k++) { a[k] = __shfl(a[k], addr, 32); addr = __shfl(addr, (idx + 31) % 32, 32); } } -template -__device__ void -backward_sequence(real rGateValue, - real rOutputGrad, - real rPreOutputValue, - real &rGateGrad, - real &rStateGrad, - real *shStateGrad, - real *shStateValue, - real *shGateValue, - real rCheck, - real &rGateValuePrev, - int index, - t_backward activeNode, - t_backward activeGate, - t_backward activeState) { +template +__device__ void backward_sequence(real rGateValue, + real rOutputGrad, + real rPreOutputValue, + real &rGateGrad, + real &rStateGrad, + real *shStateGrad, + real *shStateValue, + real *shGateValue, + real rCheck, + real &rGateValuePrev, + int index, + t_backward activeNode, + t_backward activeGate, + t_backward activeState) { const int frameIdx = index % frameSize; const int frameIdy = index / frameSize; if (frameIdy == 3) { @@ -363,8 +398,8 @@ backward_sequence(real rGateValue, rStateGrad = rGateGrad * rCheck; shStateGrad[index] = rStateGrad; ptx_sync(3, valueSize); - rStateGrad += shStateGrad[frameIdx + frameSize *2]; - rStateGrad += shStateGrad[frameIdx + frameSize *3]; + rStateGrad += shStateGrad[frameIdx + frameSize * 2]; + rStateGrad += shStateGrad[frameIdx + frameSize * 3]; rGateGrad = rStateGrad * shGateValue[frameIdx]; rGateGrad = activeGate(rGateGrad, rGateValue); } else if (frameIdy == 2) { @@ -373,7 +408,7 @@ backward_sequence(real rGateValue, shStateGrad[index] = rStateGrad; ptx_sync(3, valueSize); rStateGrad += shStateGrad[frameIdx + frameSize]; - rStateGrad += shStateGrad[frameIdx + frameSize *3]; + rStateGrad += shStateGrad[frameIdx + frameSize * 3]; rGateValuePrev = rGateValue; rGateGrad = rStateGrad * shStateValue[frameIdx]; rGateGrad = activeGate(rGateGrad, rGateValue); @@ -381,43 +416,43 @@ backward_sequence(real rGateValue, shGateValue[frameIdx] = rGateValue; ptx_sync(3, valueSize); rStateGrad = shStateGrad[frameIdx + frameSize]; - rStateGrad += shStateGrad[frameIdx + frameSize *2]; - rStateGrad += shStateGrad[frameIdx + frameSize *3]; + rStateGrad += shStateGrad[frameIdx + frameSize * 2]; + rStateGrad += shStateGrad[frameIdx + frameSize * 3]; rGateGrad = rStateGrad * shGateValue[frameIdx + frameSize]; rGateGrad = activeNode(rGateGrad, rGateValue); } } -template +template __device__ void load_weight(real rWeight[], real *weight, const int index) { if (valueSize == 128) { weight += index; - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { - rWeight[n] = weight[n*valueSize]; + rWeight[n] = weight[n * valueSize]; } transpose_32x32(rWeight, index % 32); } if (valueSize == 256) { int id = (index / 32) % 2; weight += index - id * 32 + id * 32 * valueSize; - #pragma unroll +#pragma unroll for (int n = 0; n < 32; n++) { - rWeight[n] = weight[n*valueSize]; - rWeight[n + 32] = weight[n*valueSize + 32]; + rWeight[n] = weight[n * valueSize]; + rWeight[n + 32] = weight[n * valueSize + 32]; } transpose_32x32(rWeight, index % 32); transpose_32x32(&rWeight[32], index % 32); } } -template +template __global__ void KeLstmBackward(real *gateValue, real *gateGrad, real *stateValue, - real *stateGrad, /* do not need save */ + real *stateGrad, /* do not need save */ real *preOutputValue, - real *preOutputGrad, /* do not need save */ + real *preOutputGrad, /* do not need save */ real *checkIg, real *checkIgGrad, real *checkFg, @@ -484,20 +519,27 @@ __global__ void KeLstmBackward(real *gateValue, for (int i = 0; i < length; ++i) { if (frameIdy == 3) { - if (i != length -1) { + if (i != length - 1) { frameStateValue.nextFrame(); shStateValue[frameIdx] = frameStateValue.getValue(); } else { shStateValue[frameIdx] = 0.0; } } - backward_sequence( - rGateValue, rOutputGrad, rPreOutputValue, rGateGrad, - rStateGrad, shStateGrad, shStateValue, shGateValue, - rCheck, rGateValuePrev, index, - hppl::gpu::backward[active_node], - hppl::gpu::backward[active_gate], - hppl::gpu::backward[active_state]); + backward_sequence(rGateValue, + rOutputGrad, + rPreOutputValue, + rGateGrad, + rStateGrad, + shStateGrad, + shStateValue, + shGateValue, + rCheck, + rGateValuePrev, + index, + hppl::gpu::backward[active_node], + hppl::gpu::backward[active_gate], + hppl::gpu::backward[active_state]); if (frameIdy == 3) { rCheckGrad += rGateGrad * rStateValue; rStateValue = shStateValue[frameIdx]; @@ -523,9 +565,9 @@ __global__ void KeLstmBackward(real *gateValue, shGateGrad[frameIdy][frameIdx] = rGateGrad; if (valueSize == 128) { real sum = 0.0f; - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { - sum += shGateGrad[frameIdy][n]*B_r[n]; + sum += shGateGrad[frameIdy][n] * B_r[n]; } if (frameIdy == 3) { rOutputGrad += sum; @@ -541,7 +583,7 @@ __global__ void KeLstmBackward(real *gateValue, } real sum = 0.0f; for (int n = 0; n < frameSize; n++) { - sum += A_r[n]*B_r[n]; + sum += A_r[n] * B_r[n]; } if (frameIdy == 3) { rOutputGrad += sum; @@ -552,8 +594,8 @@ __global__ void KeLstmBackward(real *gateValue, if (frameIdy == 3) { ptx_sync(6, valueSize); - #pragma unroll - for (int i = 0; i < 3; i ++) { +#pragma unroll + for (int i = 0; i < 3; i++) { rOutputGrad += shOutputGrad[i][frameIdx]; } } else { @@ -564,11 +606,14 @@ __global__ void KeLstmBackward(real *gateValue, /* TODO: Temporary save & merger in another kernel */ if (frameIdy == 1) { - if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad); + if (checkIgGrad) + paddle::paddleAtomicAdd(checkIgGrad + frameIdx, rCheckGrad); } else if (frameIdy == 2) { - if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad); + if (checkFgGrad) + paddle::paddleAtomicAdd(checkFgGrad + frameIdx, rCheckGrad); } else if (frameIdy == 3) { - if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad); + if (checkOgGrad) + paddle::paddleAtomicAdd(checkOgGrad + frameIdx, rCheckGrad); } } @@ -593,68 +638,183 @@ void hl_lstm_parallel_backward_data(real *gateValue, hl_activation_mode_t active_node, hl_activation_mode_t active_gate, hl_activation_mode_t active_state) { - CHECK(frameSize == 32 || frameSize == 64 || - frameSize == 128 || frameSize == 256); + CHECK(frameSize == 32 || frameSize == 64 || frameSize == 128 || + frameSize == 256); dim3 grid(numSequences, 1); if (!reversed) { if (frameSize == 32) { - KeLstmBackward<128, 32, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<128, 32, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmBackward<256, 64, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<256, 64, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 128) { - KeLstmBackward<512, 128, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<512, 128, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 256) { - KeLstmBackward<1024, 256, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<1024, 256, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } } else { if (frameSize == 32) { - KeLstmBackward<128, 32, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<128, 32, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmBackward<256, 64, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<256, 64, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 128) { - KeLstmBackward<512, 128, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<512, 128, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 256) { - KeLstmBackward<1024, 256, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<1024, 256, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } } CHECK_SYNC("hl_lstm_parallel_backward_data"); } -template +template __global__ void KeSetGradZero(real *gateGrad, - const int *starts, int valueSize, int numSequences, bool reversed) { + const int *starts, + int valueSize, + int numSequences, + bool reversed) { // const int tid = threadIdx.x; const int frameIdx = blockIdx.x * B_X + threadIdx.x; @@ -682,19 +842,31 @@ void hl_lstm_parallel_backward_weight(real *weightGrad, int valueSize = 4 * frameSize; dim3 threads(32, 32); dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32); - KeSetGradZero<32, 32><<>> - (gateGrad, sequence, valueSize, numSequences, reversed); + KeSetGradZero<32, 32><<>>( + gateGrad, sequence, valueSize, numSequences, reversed); if (!reversed) { hl_matrix_mul(outputValue, - HPPL_OP_T, gateGrad + valueSize, HPPL_OP_N, weightGrad, - frameSize, valueSize, batchSize - 1, - 1.0, 1.0); + HPPL_OP_T, + gateGrad + valueSize, + HPPL_OP_N, + weightGrad, + frameSize, + valueSize, + batchSize - 1, + 1.0, + 1.0); } else { hl_matrix_mul(outputValue + frameSize, - HPPL_OP_T, gateGrad, HPPL_OP_N, weightGrad, - frameSize, valueSize, batchSize - 1, - 1.0, 1.0); + HPPL_OP_T, + gateGrad, + HPPL_OP_N, + weightGrad, + frameSize, + valueSize, + batchSize - 1, + 1.0, + 1.0); } CHECK_SYNC("hl_lstm_parallel_backward_weight"); } diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index 9bcc7fb7de44b2211db450fb164655f7947dcad9..39272456c394adc0509e60cf5972df832f7b3424 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "hl_base.h" +#include "hl_device_functions.cuh" +#include "hl_gpu_matrix_kernel.cuh" #include "hl_matrix.h" -#include "hl_matrix_ops.cuh" #include "hl_matrix_apply.cuh" +#include "hl_matrix_ops.cuh" #include "hl_sequence.h" #include "hl_sparse.ph" #include "paddle/utils/Logging.h" -#include "hl_device_functions.cuh" -#include "hl_gpu_matrix_kernel.cuh" DEFINE_MATRIX_UNARY_OP(Zero, a = 0); -DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b); -void hl_matrix_add(real *A_d, - real *B_d, - real *C_d, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b); +void hl_matrix_add(real* A_d, + real* B_d, + real* C_d, int dimM, int dimN, real alpha, @@ -36,33 +35,32 @@ void hl_matrix_add(real *A_d, CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); - hl_gpu_apply_ternary_op - , 0, 0>(ternary::_add(alpha, beta), - A_d, - B_d, - C_d, - dimM, - dimN, - dimN, - dimN, - dimN); + hl_gpu_apply_ternary_op, 0, 0>( + ternary::_add(alpha, beta), + A_d, + B_d, + C_d, + dimM, + dimN, + dimN, + dimN, + dimN); CHECK_SYNC("hl_matrix_add failed"); } #ifdef PADDLE_TYPE_DOUBLE - #define THRESHOLD 128 +#define THRESHOLD 128 #else - #define THRESHOLD 64 +#define THRESHOLD 64 #endif -__device__ __forceinline__ -void findMax(real* I, - real* dfMax_s, - int blockSize, - int base, - int curIdx, - int nextIdx, - int dimN, - real* max) { +__device__ __forceinline__ void findMax(real* I, + real* dfMax_s, + int blockSize, + int base, + int curIdx, + int nextIdx, + int dimN, + real* max) { dfMax_s[base] = -1.0e20; while (curIdx < dimN) { if (dfMax_s[base] < I[nextIdx]) { @@ -78,25 +76,24 @@ void findMax(real* I, if (base < stride) { nextIdx = base + stride; if (dfMax_s[base] < dfMax_s[nextIdx]) { - dfMax_s[base] = dfMax_s[nextIdx]; + dfMax_s[base] = dfMax_s[nextIdx]; } } } - if (0 == base) { + if (0 == base) { max[0] = dfMax_s[0]; } __syncthreads(); } -__device__ __forceinline__ -void subMaxAndExp(real* I, - real* O, - int curIdx, - int nextIdx, - int blockSize, - int dimN, - real max) { +__device__ __forceinline__ void subMaxAndExp(real* I, + real* O, + int curIdx, + int nextIdx, + int blockSize, + int dimN, + real max) { real val; while (curIdx < dimN) { val = I[nextIdx] - max; @@ -115,14 +112,13 @@ void subMaxAndExp(real* I, __syncthreads(); } -__device__ __forceinline__ -void valueSum(real* O, - real* dfMax_s, - int blockSize, - int base, - int curIdx, - int nextIdx, - int dimN) { +__device__ __forceinline__ void valueSum(real* O, + real* dfMax_s, + int blockSize, + int base, + int curIdx, + int nextIdx, + int dimN) { dfMax_s[base] = 0; while (curIdx < dimN) { dfMax_s[base] += O[nextIdx]; @@ -141,13 +137,8 @@ void valueSum(real* O, __syncthreads(); } -__device__ __forceinline__ -void divSum(real* O, - real sum, - int curIdx, - int nextIdx, - int blockSize, - int dimN) { +__device__ __forceinline__ void divSum( + real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) { while (curIdx < dimN) { O[nextIdx] /= sum; nextIdx += blockSize; @@ -155,20 +146,18 @@ void divSum(real* O, } } -__device__ __forceinline__ -void softmax(real* I, - real* O, - real* dfMax_s, - int blockSize, - int base, - int curIdx, - int nextIdx, - int dimN) { +__device__ __forceinline__ void softmax(real* I, + real* O, + real* dfMax_s, + int blockSize, + int base, + int curIdx, + int nextIdx, + int dimN) { __shared__ real max; // find the max number - findMax(I, dfMax_s, blockSize, base, curIdx, - nextIdx, dimN, &max); + findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max); // sub max Value and do Exp operation subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max); @@ -181,8 +170,8 @@ void softmax(real* I, divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN); } -template -__global__ void KeMatrixSoftMax(real *O, real *I, int dimN) { +template +__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) { int base = threadIdx.x; __shared__ real dfMax_s[blockSize]; int nextIdx = blockIdx.x * dimN + base; @@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) { softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); } -void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) { +void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); dim3 block(512, 1); dim3 grid(dimM, 1); - KeMatrixSoftMax<512> - <<>>(C_d, A_d, dimN); + KeMatrixSoftMax<512><<>>(C_d, A_d, dimN); CHECK_SYNC("hl_matrix_softmax failed"); } -template -__global__ void KeSequenceSoftMax(real *O, real *I, const int* index) { +template +__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) { int base = threadIdx.x; int bid = blockIdx.x; __shared__ real dfMax_s[blockSize]; @@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) { softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); } -void hl_sequence_softmax_forward(real *A_d, - real *C_d, +void hl_sequence_softmax_forward(real* A_d, + real* C_d, const int* index, int numSequence) { CHECK_NOTNULL(A_d); @@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d, dim3 block(512, 1); dim3 grid(numSequence, 1); - KeSequenceSoftMax<512> - <<>>(C_d, A_d, index); + KeSequenceSoftMax<512><<>>(C_d, A_d, index); CHECK_SYNC("hl_sequence_softmax_forward failed"); } -__global__ void KeMatrixDerivative(real *grad_d, - real *output_d, - real *sftmaxSum_d, - int dimM, - int dimN) { - int rowIdx = blockIdx.x*blockDim.x + threadIdx.x; - int colIdx = blockIdx.y*blockDim.y + threadIdx.y; +__global__ void KeMatrixDerivative( + real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) { + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + int colIdx = blockIdx.y * blockDim.y + threadIdx.y; int index; if (rowIdx < dimM && colIdx < dimN) { - index = rowIdx*dimN + colIdx; + index = rowIdx * dimN + colIdx; grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]); } } -void hl_matrix_softmax_derivative(real *grad_d, - real *output_d, - real *sftmaxSum_d, - int dimM, - int dimN) { +void hl_matrix_softmax_derivative( + real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) { CHECK_NOTNULL(grad_d); CHECK_NOTNULL(output_d); CHECK_NOTNULL(sftmaxSum_d); int blocksX = (dimM + 0) / 1; - int blocksY = (dimN + 1024 -1) / 1024; + int blocksY = (dimN + 1024 - 1) / 1024; dim3 threads(1, 1024); dim3 grid(blocksX, blocksY); - KeMatrixDerivative<<< grid, threads, 0, STREAM_DEFAULT >>> - (grad_d, output_d, sftmaxSum_d, dimM, dimN); + KeMatrixDerivative<<>>( + grad_d, output_d, sftmaxSum_d, dimM, dimN); CHECK_SYNC("hl_matrix_softmax_derivative failed"); } -__global__ void KeMatrixMultiBinaryCrossEntropy(real* output, - real* entropy, - int* row, - int* col, - int dimM, - int dimN) { +__global__ void KeMatrixMultiBinaryCrossEntropy( + real* output, real* entropy, int* row, int* col, int dimM, int dimN) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < dimM) { - for (int i = 0; i < dimN; i ++) { + for (int i = 0; i < dimN; i++) { entropy[index] -= log(1 - output[index * dimN + i]); } - int *row_col = col + row[index]; + int* row_col = col + row[index]; int col_num = row[index + 1] - row[index]; - for (int i = 0; i < col_num; i ++) { + for (int i = 0; i < col_num; i++) { real o = output[index * dimN + row_col[i]]; entropy[index] -= log(o / (1 - o)); } @@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output, dim3 threads(n_threads); dim3 grid(blocks); hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); - KeMatrixMultiBinaryCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>> - (output, entropy, mat->csr_row, mat->csr_col, dimM, dimN); + KeMatrixMultiBinaryCrossEntropy<<>>( + output, entropy, mat->csr_row, mat->csr_col, dimM, dimN); CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed"); } -__global__ void KeMatrixMultiBinaryCrossEntropyBp(real* output, - real* grad, - int* row, - int* col, - int dimM, - int dimN) { +__global__ void KeMatrixMultiBinaryCrossEntropyBp( + real* output, real* grad, int* row, int* col, int dimM, int dimN) { int row_idx = blockIdx.x * blockDim.x + threadIdx.x; if (row_idx < dimM) { - for (int i = 0; i < dimN; i ++) { + for (int i = 0; i < dimN; i++) { int index = row_idx * dimN + i; grad[index] += 1.0 / (1 - output[index]); } int col_num = row[row_idx + 1] - row[row_idx]; - int *row_col = col + row[row_idx]; - for (int i = 0; i < col_num; i ++) { + int* row_col = col + row[row_idx]; + for (int i = 0; i < col_num; i++) { int index = row_idx * dimN + row_col[i]; grad[index] -= 1.0 / (output[index] * (1 - output[index])); } } } -void hl_matrix_multi_binary_cross_entropy_bp(real* output, - real* grad, - hl_sparse_matrix_s csr_mat, - int dimM, - int dimN) { +void hl_matrix_multi_binary_cross_entropy_bp( + real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) { CHECK_NOTNULL(output); CHECK_NOTNULL(grad); CHECK_NOTNULL(csr_mat); @@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output, dim3 threads(n_threads); dim3 grid(blocks); hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); - KeMatrixMultiBinaryCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>> - (output, grad, mat->csr_row, mat->csr_col, dimM, dimN); + KeMatrixMultiBinaryCrossEntropyBp<<>>( + output, grad, mat->csr_row, mat->csr_col, dimM, dimN); CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed"); } -__global__ void KeMatrixCrossEntropy(real* O, - real* E, - int* label, - int dimM, - int dimN) { +__global__ void KeMatrixCrossEntropy( + real* O, real* E, int* label, int dimM, int dimN) { int index = blockIdx.x * blockDim.x + threadIdx.x; int newBase; if (index < dimM) { @@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O, } } -void hl_matrix_cross_entropy(real* A_d, - real* C_d, - int* label_d, - int dimM, - int dimN) { +void hl_matrix_cross_entropy( + real* A_d, real* C_d, int* label_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); int blocks = (dimM + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); - KeMatrixCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>> - (A_d, C_d, label_d, dimM, dimN); + KeMatrixCrossEntropy<<>>( + A_d, C_d, label_d, dimM, dimN); CHECK_SYNC("hl_matrix_cross_entropy failed"); } -__global__ void KeMatrixCrossEntropyBp(real* grad_d, - real* output_d, - int* label_d, - int dimM, - int dimN) { - int rowIdx = blockIdx.x*blockDim.x + threadIdx.x; - int colIdx = blockIdx.y*blockDim.y + threadIdx.y; +__global__ void KeMatrixCrossEntropyBp( + real* grad_d, real* output_d, int* label_d, int dimM, int dimN) { + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + int colIdx = blockIdx.y * blockDim.y + threadIdx.y; int index; if (rowIdx < dimM && colIdx < dimN) { - index = rowIdx*dimN + colIdx; + index = rowIdx * dimN + colIdx; if (label_d[rowIdx] == colIdx) { grad_d[index] -= 1.0f / output_d[index]; } } } -void hl_matrix_cross_entropy_bp(real* grad_d, - real* output_d, - int* label_d, - int dimM, - int dimN) { +void hl_matrix_cross_entropy_bp( + real* grad_d, real* output_d, int* label_d, int dimM, int dimN) { CHECK_NOTNULL(grad_d); CHECK_NOTNULL(output_d); CHECK_NOTNULL(label_d); - int blocksX = (dimM + 0)/1; - int blocksY = (dimN + 1024 -1) / 1024; + int blocksX = (dimM + 0) / 1; + int blocksY = (dimN + 1024 - 1) / 1024; dim3 threads(1, 1024); dim3 grid(blocksX, blocksY); - KeMatrixCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>> - (grad_d, output_d, label_d, dimM, dimN); + KeMatrixCrossEntropyBp<<>>( + grad_d, output_d, label_d, dimM, dimN); CHECK_SYNC("hl_matrix_cross_entropy_bp failed"); } void hl_matrix_zero_mem(real* data, int num) { - hl_gpu_apply_unary_op( - unary::Zero(), data, 1, num, num); + hl_gpu_apply_unary_op(unary::Zero(), data, 1, num, num); } __global__ void KeParamReluForward(real* output, @@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output, int ty = blockIdx.y * blockDim.y + threadIdx.y; if (tx < width && ty < height) { int index = ty * width + tx; - output[index] = input[index] > 0 ? input[index] : - input[index] * w[tx / partial_sum]; + output[index] = + input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum]; } } @@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output, CHECK_NOTNULL(w); dim3 threads(16, 16); int blockX = (width + 16 - 1) / 16; - int blockY = (height + 16 -1) / 16; + int blockY = (height + 16 - 1) / 16; dim3 grid(blockX, blockY); - KeParamReluForward<<>> - (output, input, w, width, height, partial_sum); + KeParamReluForward<<>>( + output, input, w, width, height, partial_sum); CHECK_SYNC("hl_param_relu_forward failed"); } -template +template __global__ void KeParamReluBackWardW(real* grad_w, real* grad_o, real* input, @@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w, int grid_num = width / partial_sum; dim3 threads(blockSize, 1); dim3 grid(grid_num, 1); - KeParamReluBackWardW<<>> - (grad_w, grad_o, input, width, height, partial_sum); + KeParamReluBackWardW<<>>( + grad_w, grad_o, input, width, height, partial_sum); CHECK_SYNC("hl_param_relu_backward_w failed"); } @@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o, CHECK_NOTNULL(diff); dim3 threads(16, 16); int blockX = (width + 16 - 1) / 16; - int blockY = (height + 16 -1) / 16; + int blockY = (height + 16 - 1) / 16; dim3 grid(blockX, blockY); - KeParamReluBackwardDiff<<>> - (grad_o, data, w, diff, width, height, partial_sum); + KeParamReluBackwardDiff<<>>( + grad_o, data, w, diff, width, height, partial_sum); CHECK_SYNC("hl_param_relu_backward_diff failed"); } -__global__ void KeMatrixAddSharedBias(real* A, - real* B, - const int channel, - const int M, - const int N, - real scale) { +__global__ void KeMatrixAddSharedBias( + real* A, real* B, const int channel, const int M, const int N, real scale) { int index = blockIdx.x * blockDim.x + threadIdx.x; int dim = N / channel; if (index < M * N) { @@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d, real scale) { const int blocks = 512; const int grids = DIVUP(dimM * dimN, blocks); - KeMatrixAddSharedBias<<>> - (A_d, B_d, channel, dimM, dimN, scale); + KeMatrixAddSharedBias<<>>( + A_d, B_d, channel, dimM, dimN, scale); CHECK_SYNC("hl_matrix_add_shared_bias failed"); } - template -__global__ void KeMatrixCollectSharedBias(real *B, - real *A, +__global__ void KeMatrixCollectSharedBias(real* B, + real* A, const int channel, const int M, const int N, @@ -589,7 +541,7 @@ __global__ void KeMatrixCollectSharedBias(real *B, int n = j * blockSize + tid; int m = n / dim; int w = n % dim; - smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0; + smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0; __syncthreads(); simpleReduce(smem, tid, blockSize); sum += smem[0]; @@ -611,33 +563,32 @@ void hl_matrix_collect_shared_bias(real* B_d, const int limit = 64; int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel; - KeMatrixCollectSharedBias - <<< grids, blocks, 0, STREAM_DEFAULT>>> - (B_d, A_d, channel, dimM, dimN, dim, limit, scale); + KeMatrixCollectSharedBias<<>>( + B_d, A_d, channel, dimM, dimN, dim, limit, scale); CHECK_SYNC("hl_matrix_collect_shared_bias failed"); } -__global__ void keMatrixRotate(real* mat, real* matRot, - int dimM, int dimN, bool clockWise) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < dimM * dimN) { - int i = idx / dimN; - int j = idx % dimN; - if (clockWise) { - matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j]; - } else { - matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)]; - } +__global__ void keMatrixRotate( + real* mat, real* matRot, int dimM, int dimN, bool clockWise) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < dimM * dimN) { + int i = idx / dimN; + int j = idx % dimN; + if (clockWise) { + matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j]; + } else { + matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)]; } + } } -void hl_matrix_rotate(real *mat, real* matRot, - int dimM, int dimN, bool clockWise) { - CHECK_NOTNULL(mat); - CHECK_NOTNULL(matRot); - const int threads = 512; - const int blocks = DIVUP(dimM * dimN, threads); - keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>> - (mat, matRot, dimM, dimN, clockWise); - CHECK_SYNC("hl_matrix_rotate failed"); +void hl_matrix_rotate( + real* mat, real* matRot, int dimM, int dimN, bool clockWise) { + CHECK_NOTNULL(mat); + CHECK_NOTNULL(matRot); + const int threads = 512; + const int blocks = DIVUP(dimM * dimN, threads); + keMatrixRotate<<>>( + mat, matRot, dimM, dimN, clockWise); + CHECK_SYNC("hl_matrix_rotate failed"); } diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index 4f650ce03ccb2d14cc2997e9cd426acb91439539..c52780dfcaff6e5b94d3568fac4ca011b76a1442 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -16,36 +16,36 @@ limitations under the License. */ #include "hl_device_functions.cuh" #include "paddle/utils/Logging.h" -__global__ void KeMaxSequenceForward(real *input, - const int *sequence, +__global__ void KeMaxSequenceForward(real* input, + const int* sequence, real* output, - int *index, + int* index, int numSequences, int dim) { int dimIdx = threadIdx.x; int sequenceId = blockIdx.x; if (sequenceId >= numSequences) return; int start = sequence[sequenceId]; - int end = sequence[sequenceId+1]; + int end = sequence[sequenceId + 1]; for (int i = dimIdx; i < dim; i += blockDim.x) { real tmp = -HL_FLOAT_MAX; int tmpId = -1; for (int insId = start; insId < end; insId++) { - if (tmp < input[insId*dim + i]) { - tmp = input[insId*dim + i]; + if (tmp < input[insId * dim + i]) { + tmp = input[insId * dim + i]; tmpId = insId; } } - output[sequenceId*dim + i] = tmp; - index[sequenceId*dim + i] = tmpId; + output[sequenceId * dim + i] = tmp; + index[sequenceId * dim + i] = tmpId; } } void hl_max_sequence_forward(real* input, const int* sequence, real* output, - int *index, + int* index, int numSequences, int dim) { CHECK_NOTNULL(input); @@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input, dim3 threads(256, 1); dim3 grid(numSequences, 1); - KeMaxSequenceForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (input, sequence, output, index, numSequences, dim); + KeMaxSequenceForward<<>>( + input, sequence, output, index, numSequences, dim); CHECK_SYNC("hl_max_sequence_forward failed"); } -__global__ void KeMaxSequenceBackward(real *outputGrad, - int *index, - real* inputGrad, - int numSequences, - int dim) { +__global__ void KeMaxSequenceBackward( + real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) { int idx = threadIdx.x + blockIdx.x * blockDim.x; int colIdx = idx % dim; - if (idx < numSequences*dim) { + if (idx < numSequences * dim) { int insId = index[idx]; inputGrad[insId * dim + colIdx] += outputGrad[idx]; } } -void hl_max_sequence_backward(real* outputGrad, - int *index, - real* inputGrad, - int numSequences, - int dim) { +void hl_max_sequence_backward( + real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) { CHECK_NOTNULL(outputGrad); CHECK_NOTNULL(index); CHECK_NOTNULL(inputGrad); @@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad, unsigned int blocks = (numSequences * dim + 128 - 1) / 128; dim3 threads(128, 1); dim3 grid(blocks, 1); - KeMaxSequenceBackward<<< grid, threads, 0, STREAM_DEFAULT >>> - (outputGrad, index, inputGrad, numSequences, dim); + KeMaxSequenceBackward<<>>( + outputGrad, index, inputGrad, numSequences, dim); CHECK_SYNC("hl_max_sequence_backward failed"); } -template +template __global__ void KeMatrixAddRows(real* output, real* table, int* ids, @@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output, while (sampleId < numSamples) { int tableId = ids[sampleId]; if ((0 <= tableId) && (tableId < tableSize)) { - real *outputData = output + sampleId * dim; - real *tableData = table + tableId * dim; + real* outputData = output + sampleId * dim; + real* tableData = table + tableId * dim; for (int i = idx; i < dim; i += blockDimX) { if (AddRow == 0) { outputData[i] += tableData[i]; @@ -114,24 +108,27 @@ __global__ void KeMatrixAddRows(real* output, } } } - sampleId += blockDimY*gridDimX; + sampleId += blockDimY * gridDimX; } } -template -__global__ -void KeSequence2Batch(real *batch, - real *sequence, - const int *batchIndex, - int seqWidth, - int batchCount) { +template +__global__ void KeSequence2Batch(real* batch, + real* sequence, + const int* batchIndex, + int seqWidth, + int batchCount) { int idx = threadIdx.x; int idy = threadIdx.y; int id = blockIdx.x + idy * gridDimX; while (id < batchCount) { int seqId = batchIndex[id]; - real* batchData = batch + id*seqWidth; - real* seqData = sequence + seqId*seqWidth; + real* batchData = batch + id * seqWidth; + real* seqData = sequence + seqId * seqWidth; for (int i = idx; i < seqWidth; i += blockDimX) { if (seq2batch) { if (isAdd) { @@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch, } } } - id += blockDimY*gridDimX; + id += blockDimY * gridDimX; } } -void hl_sequence2batch_copy(real *batch, - real *sequence, - const int *batchIndex, +void hl_sequence2batch_copy(real* batch, + real* sequence, + const int* batchIndex, int seqWidth, int batchCount, bool seq2batch) { @@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch, dim3 threads(128, 8); dim3 grid(8, 1); if (seq2batch) { - KeSequence2Batch<128, 8, 8, 1, 0><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 1, 0><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } else { - KeSequence2Batch<128, 8, 8, 0, 0><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 0, 0><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } CHECK_SYNC("hl_sequence2batch_copy failed"); } -void hl_sequence2batch_add(real *batch, - real *sequence, - int *batchIndex, +void hl_sequence2batch_add(real* batch, + real* sequence, + int* batchIndex, int seqWidth, int batchCount, bool seq2batch) { @@ -186,23 +183,22 @@ void hl_sequence2batch_add(real *batch, dim3 threads(128, 8); dim3 grid(8, 1); if (seq2batch) { - KeSequence2Batch<128, 8, 8, 1, 1><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 1, 1><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } else { - KeSequence2Batch<128, 8, 8, 0, 1><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 0, 1><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } CHECK_SYNC("hl_sequence2batch_add failed"); } -template -__global__ -void KeSequence2BatchPadding(real* batch, - real* sequence, - const int* sequenceStartPositions, - const size_t sequenceWidth, - const size_t maxSequenceLength, - const size_t numSequences) { +template +__global__ void KeSequence2BatchPadding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences) { int batchIdx = blockIdx.y; int sequenceStart = sequenceStartPositions[batchIdx]; int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart; @@ -269,45 +265,56 @@ void hl_sequence2batch_copy_padding(real* batch, int blockDimY = CUDA_BLOCK_SIZE / blockDimX; dim3 threads(blockDimX, blockDimY); - int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) / - CUDA_BLOCK_SIZE; + int gridDimX = (maxSequenceLength + blockDimY - 1) / blockDimY; int gridDimY = numSequences; dim3 grid(gridDimX, gridDimY); if (seq2batch) { /* sequence -> batch */ if (normByTimes) { - KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<1, 1><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); } else { - KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<0, 1><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); } } else { /* batch -> sequence */ if (normByTimes) { - KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<1, 0><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); } else { - KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<0, 0><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); } } CHECK_SYNC("hl_sequence2batch_copy_padding failed"); } -__device__ inline float my_rsqrt(float x) { - return rsqrtf(x); -} +__device__ inline float my_rsqrt(float x) { return rsqrtf(x); } -__device__ inline double my_rsqrt(double x) { - return rsqrt(x); -} +__device__ inline double my_rsqrt(double x) { return rsqrt(x); } __global__ void KeSequenceAvgForward(real* dst, real* src, @@ -328,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst, for (int i = start; i < end; i++) { sum += src[i * width + col]; } - sum = mode == 1 ? sum : - (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength)); + sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength + : sum * my_rsqrt((real)seqLength)); dst[gid] += sum; } } @@ -348,10 +355,10 @@ void hl_sequence_avg_forward(real* dst, int grid = DIVUP(width * height, 512); CHECK(mode == 0 || mode == 1 || mode == 2) - << "mode error in hl_sequence_avg_forward!"; + << "mode error in hl_sequence_avg_forward!"; - KeSequenceAvgForward<<< grid, block, 0, STREAM_DEFAULT >>> - (dst, src, starts, height, width, mode); + KeSequenceAvgForward<<>>( + dst, src, starts, height, width, mode); CHECK_SYNC("hl_sequence_avg_forward failed"); } @@ -371,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst, int seqLength = end - start; if (seqLength == 0) return; real grad = src[gid]; - grad = mode == 1 ? grad : - (mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength)); + grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength + : grad * my_rsqrt((real)seqLength)); for (int i = start; i < end; i++) { dst[i * width + col] += grad; } @@ -393,9 +400,9 @@ void hl_sequence_avg_backward(real* dst, int grid = DIVUP(width * height, 512); CHECK(mode == 0 || mode == 1 || mode == 2) - << "mode error in hl_sequence_avg_backward!"; + << "mode error in hl_sequence_avg_backward!"; - KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>> - (dst, src, starts, height, width, mode); + KeSequenceAvgBackward<<>>( + dst, src, starts, height, width, mode); CHECK_SYNC("hl_sequence_avg_backward failed"); } diff --git a/paddle/cuda/src/hl_cuda_sparse.cu b/paddle/cuda/src/hl_cuda_sparse.cu index ab9ab57c884137f117c25c2752b5603b2e8b7135..6351e7e01ee55b6303a6e48bc9ebf9834a83130e 100644 --- a/paddle/cuda/src/hl_cuda_sparse.cu +++ b/paddle/cuda/src/hl_cuda_sparse.cu @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "hl_cuda.h" +#include "hl_cuda_sparse.cuh" +#include "hl_matrix_apply.cuh" +#include "hl_matrix_ops.cuh" #include "hl_sparse.h" #include "hl_sparse.ph" -#include "hl_matrix_ops.cuh" -#include "hl_matrix_apply.cuh" -#include "hl_cuda_sparse.cuh" #include "paddle/utils/Logging.h" DEFINE_MATRIX_UNARY_PARAMETER_OP(mul_scalar, ONE_PARAMETER, a = a * p); @@ -34,15 +33,15 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d, CHECK(A_d->format == HL_SPARSE_CSR) << "matrix format error!"; if (A_d->nnz == 0) { - hl_gpu_apply_unary_op( - unary::Zero(), C_d, dimM, dimN, dimN); + hl_gpu_apply_unary_op(unary::Zero(), C_d, dimM, dimN, dimN); return; } /* nnz != 0 */ hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); - CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && - A_d2->csr_row && A_d2->csr_col) << "parameter transa error!"; + CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && A_d2->csr_row && + A_d2->csr_col) + << "parameter transa error!"; int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; @@ -50,21 +49,11 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d, dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsr2Dense<0> - <<>>(A_d2->csr_val, - A_d2->csr_row, - A_d2->csr_col, - C_d, - dimM, - dimN); + KeSMatrixCsr2Dense<0><<>>( + A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN); } else if (A_d->type == HL_FLOAT_VALUE) { - KeSMatrixCsr2Dense<1> - <<>>(A_d2->csr_val, - A_d2->csr_row, - A_d2->csr_col, - C_d, - dimM, - dimN); + KeSMatrixCsr2Dense<1><<>>( + A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN); } else { } CHECK_SYNC("hl_matrix_csr2dense failed"); @@ -80,15 +69,15 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, CHECK(A_d->format == HL_SPARSE_CSC) << "matrix format error!"; if (A_d->nnz == 0) { - hl_gpu_apply_unary_op( - unary::Zero(), C_d, dimM, dimN, dimN); + hl_gpu_apply_unary_op(unary::Zero(), C_d, dimM, dimN, dimN); return; } /* nnz != 0 */ hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix); - CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && - A_d2->csc_row && A_d2->csc_col) << "parameter transa error!"; + CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && A_d2->csc_row && + A_d2->csc_col) + << "parameter transa error!"; int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; @@ -96,21 +85,11 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsc2Dense<0> - <<>>(A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - C_d, - dimM, - dimN); + KeSMatrixCsc2Dense<0><<>>( + A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN); } else if (A_d->type == HL_FLOAT_VALUE) { - KeSMatrixCsc2Dense<1> - <<>>(A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - C_d, - dimM, - dimN); + KeSMatrixCsc2Dense<1><<>>( + A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN); } else { } CHECK_SYNC("hl_matrix_csc2dense failed"); @@ -118,43 +97,43 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) { CHECK_NOTNULL(A_d); CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; CHECK(value_type == HL_FLOAT_VALUE || value_type == HL_NO_VALUE) - << "sparse matrix value type error!"; + << "sparse matrix value type error!"; /* avoid malloc 0 bytes */ int nnz_s = (nnz == 0 ? 1 : nnz); if (format == HL_SPARSE_CSR) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csr_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); CHECK_NOTNULL(tmp); - hl_csr_matrix csr = (hl_csr_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); csr->sparsity = -1.0; if (value_type == HL_NO_VALUE) { csr->csr_val = NULL; csr->nnz_s = nnz_s; - csr->row_s = dimM+1; - csr->csr_row = (int*)hl_malloc_device((dimM+1)*sizeof(int)); - csr->csr_col = (int*)hl_malloc_device((nnz_s)*sizeof(int)); + csr->row_s = dimM + 1; + csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int)); + csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; } else if (value_type == HL_FLOAT_VALUE) { csr->nnz_s = nnz_s; - csr->row_s = dimM+1; - csr->csr_val = (real*)hl_malloc_device((nnz_s)*sizeof(real)); - csr->csr_row = (int*)hl_malloc_device((dimM+1)*sizeof(int)); - csr->csr_col = (int*)hl_malloc_device((nnz_s)*sizeof(int)); + csr->row_s = dimM + 1; + csr->csr_val = (real *)hl_malloc_device((nnz_s) * sizeof(real)); + csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int)); + csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; @@ -162,28 +141,28 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, } else if (format == HL_SPARSE_CSC) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csc_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); CHECK_NOTNULL(tmp); - hl_csc_matrix csc = (hl_csc_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); csc->sparsity = -1.0f; if (value_type == HL_NO_VALUE) { csc->csc_val = NULL; csc->nnz_s = nnz_s; - csc->col_s = dimN+1; - csc->csc_row = (int*)hl_malloc_device((nnz_s)*sizeof(int)); - csc->csc_col = (int*)hl_malloc_device((dimN+1)*sizeof(int)); + csc->col_s = dimN + 1; + csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int)); + csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csc; } else if (value_type == HL_FLOAT_VALUE) { csc->nnz_s = nnz_s; - csc->col_s = dimN+1; - csc->csc_val = (real*)hl_malloc_device((nnz_s)*sizeof(real)); - csc->csc_row = (int*)hl_malloc_device((nnz_s)*sizeof(int)); - csc->csc_col = (int*)hl_malloc_device((dimN+1)*sizeof(int)); + csc->col_s = dimN + 1; + csc->csc_val = (real *)hl_malloc_device((nnz_s) * sizeof(real)); + csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int)); + csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csc; @@ -200,7 +179,7 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) { CHECK_NOTNULL(A_d); CHECK(A_d->format == HL_SPARSE_CSR || A_d->format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; if (A_d->matrix == NULL) { free(A_d); @@ -249,77 +228,77 @@ void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) { } void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - void * dest_d, + void *dest_d, size_t size, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) { CHECK_NOTNULL(A_d); CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; if (format == HL_SPARSE_CSR) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - size_t size_ = (dimM+1)*sizeof(int) + nnz*sizeof(int); + size_t size_ = (dimM + 1) * sizeof(int) + nnz * sizeof(int); if (value_type != HL_NO_VALUE) { - size_ += nnz*sizeof(real); + size_ += nnz * sizeof(real); } CHECK_LE(size_, size) << "dest_d size(" << size - << ") too small, should bigger than(" << size_ << ")!"; + << ") too small, should bigger than(" << size_ + << ")!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csr_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); CHECK_NOTNULL(tmp); - hl_csr_matrix csr = (hl_csr_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); if (value_type == HL_NO_VALUE) { csr->csr_val = NULL; - csr->csr_row = (int*)dest_d; - csr->csr_col = (int*)((char*)dest_d + (dimM+1)*sizeof(int)); + csr->csr_row = (int *)dest_d; + csr->csr_col = (int *)((char *)dest_d + (dimM + 1) * sizeof(int)); } else { - csr->csr_val = (real*)dest_d; - csr->csr_row = (int*)((char*)dest_d + nnz*sizeof(real)); - csr->csr_col = (int*)((char*)dest_d + - nnz*sizeof(real) + - (dimM+1)*sizeof(int)); + csr->csr_val = (real *)dest_d; + csr->csr_row = (int *)((char *)dest_d + nnz * sizeof(real)); + csr->csr_col = (int *)((char *)dest_d + nnz * sizeof(real) + + (dimM + 1) * sizeof(int)); } csr->nnz_s = nnz; - csr->row_s = dimM+1; + csr->row_s = dimM + 1; csr->sparsity = -1.0; *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; } else if (format == HL_SPARSE_CSC) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - size_t size_ = (dimN+1)*sizeof(int) + nnz*sizeof(int); + size_t size_ = (dimN + 1) * sizeof(int) + nnz * sizeof(int); if (value_type != HL_NO_VALUE) { - size_ += nnz*sizeof(real); + size_ += nnz * sizeof(real); } CHECK_LE(size_, size) << "dest_d size(" << size - << ") too small, should bigger than(" << size_ << ")!"; + << ") too small, should bigger than(" << size_ + << ")!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csc_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); CHECK_NOTNULL(tmp); - hl_csc_matrix csc = (hl_csc_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); if (value_type == HL_NO_VALUE) { csc->csc_val = NULL; - csc->csc_col = (int*)dest_d; - csc->csc_row = (int*)((char*)dest_d + (dimN+1)*sizeof(int)); + csc->csc_col = (int *)dest_d; + csc->csc_row = (int *)((char *)dest_d + (dimN + 1) * sizeof(int)); } else { - csc->csc_val = (real*)dest_d; - csc->csc_col = (int*)((char*)dest_d + nnz*sizeof(real)); - csc->csc_row = (int*)((char*)dest_d + - nnz*sizeof(real) + - (dimN+1)*sizeof(int)); + csc->csc_val = (real *)dest_d; + csc->csc_col = (int *)((char *)dest_d + nnz * sizeof(real)); + csc->csc_row = (int *)((char *)dest_d + nnz * sizeof(real) + + (dimN + 1) * sizeof(int)); } csc->nnz_s = nnz; - csc->col_s = dimN+1; + csc->col_s = dimN + 1; csc->sparsity = -1.0f; *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csc; @@ -333,11 +312,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, } void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - real* value_d, - int* rows_d, - int* cols_d, + real *value_d, + int *rows_d, + int *cols_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) { @@ -345,11 +324,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; if (format == HL_SPARSE_CSR) { - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csr_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); CHECK_NOTNULL(tmp); hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); @@ -362,8 +341,8 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; } else if (format == HL_SPARSE_CSC) { - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csc_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); CHECK_NOTNULL(tmp); hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); @@ -396,35 +375,30 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix, hl_stream_t stream) { CHECK_NOTNULL(csr_matrix); CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR) - << "csr_matrix is not csr format!"; + << "csr_matrix is not csr format!"; CHECK_NOTNULL(csr_matrix->matrix); hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix); - CHECK_LE(csr_matrix->nnz, csr->nnz_s) - << "copy size " << csr_matrix->nnz - << " is big than alloc size " << csr->nnz_s; + CHECK_LE(csr_matrix->nnz, csr->nnz_s) << "copy size " << csr_matrix->nnz + << " is big than alloc size " + << csr->nnz_s; - CHECK_LE((csr_matrix->rows+1), csr->row_s) - << "copy size " << (csr_matrix->rows + 1) - << " is big than alloc size " << csr->row_s; + CHECK_LE((csr_matrix->rows + 1), csr->row_s) + << "copy size " << (csr_matrix->rows + 1) << " is big than alloc size " + << csr->row_s; - CHECK(csr_matrix->type == HL_FLOAT_VALUE || - csr_matrix->type == HL_NO_VALUE) - << "sparse matrix value type error!"; + CHECK(csr_matrix->type == HL_FLOAT_VALUE || csr_matrix->type == HL_NO_VALUE) + << "sparse matrix value type error!"; if (csr_matrix->type == HL_NO_VALUE) { if (csr_row == NULL && csr_col == NULL) { return; } else if (csr_row != NULL && csr_col != NULL) { - hl_memcpy_async(csr->csr_row, - csr_row, - (csr_matrix->rows+1)*sizeof(int), - stream); + hl_memcpy_async( + csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream); - hl_memcpy_async(csr->csr_col, - csr_col, - (csr_matrix->nnz)*sizeof(int), - stream); + hl_memcpy_async( + csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream); } else { LOG(FATAL) << "parameter csr_row or csr_col is null pointer!"; } @@ -432,30 +406,21 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix, if (csr_val == NULL && csr_row == NULL && csr_col == NULL) { return; } else if (csr_val != NULL && csr_row == NULL && csr_col == NULL) { - hl_memcpy_async(csr->csr_val, - csr_val, - (csr_matrix->nnz)*sizeof(real), - stream); + hl_memcpy_async( + csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream); } else if (csr_val != NULL && csr_row != NULL && csr_col != NULL) { - hl_memcpy_async(csr->csr_val, - csr_val, - (csr_matrix->nnz)*sizeof(real), - stream); - hl_memcpy_async(csr->csr_row, - csr_row, - (csr_matrix->rows+1)*sizeof(int), - stream); - hl_memcpy_async(csr->csr_col, - csr_col, - (csr_matrix->nnz)*sizeof(int), - stream); + hl_memcpy_async( + csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream); + hl_memcpy_async( + csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream); + hl_memcpy_async( + csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream); } else { LOG(FATAL) << "parameter csr_row or csr_col is null pointer!"; } } - csr->sparsity = ((float)csr_matrix->nnz) / - ((float)csr_matrix->rows) / + csr->sparsity = ((float)csr_matrix->nnz) / ((float)csr_matrix->rows) / ((float)csr_matrix->cols); } @@ -466,33 +431,28 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix, hl_stream_t stream) { CHECK_NOTNULL(csc_matrix); CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC) - << "csc_matrix is not csc format error!"; + << "csc_matrix is not csc format error!"; hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix); - CHECK_LE(csc_matrix->nnz, csc->nnz_s) - << "copy size " << csc_matrix->nnz - << " is big than alloc size " << csc->nnz_s; + CHECK_LE(csc_matrix->nnz, csc->nnz_s) << "copy size " << csc_matrix->nnz + << " is big than alloc size " + << csc->nnz_s; - CHECK_LE((csc_matrix->cols+1), csc->col_s) - << "copy size " <<(csc_matrix->cols + 1) - << " is big than alloc size " << csc->col_s; + CHECK_LE((csc_matrix->cols + 1), csc->col_s) + << "copy size " << (csc_matrix->cols + 1) << " is big than alloc size " + << csc->col_s; - CHECK(csc_matrix->type == HL_FLOAT_VALUE || - csc_matrix->type == HL_NO_VALUE) - << "sparse matrix value type error!"; + CHECK(csc_matrix->type == HL_FLOAT_VALUE || csc_matrix->type == HL_NO_VALUE) + << "sparse matrix value type error!"; if (csc_matrix->type == HL_NO_VALUE) { if (csc_row == NULL && csc_col == NULL) { return; } else if (csc_row != NULL && csc_col != NULL) { - hl_memcpy_async(csc->csc_row, - csc_row, - (csc_matrix->nnz)*sizeof(int), - stream); - hl_memcpy_async(csc->csc_col, - csc_col, - (csc_matrix->cols+1)*sizeof(int), - stream); + hl_memcpy_async( + csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream); + hl_memcpy_async( + csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); } else { LOG(FATAL) << "parameter csc_row or csc_col is null pointer!"; } @@ -500,30 +460,21 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix, if (csc_val == NULL && csc_row == NULL && csc_col == NULL) { return; } else if (csc_val != NULL && csc_row == NULL && csc_col == NULL) { - hl_memcpy_async(csc->csc_val, - csc_val, - (csc_matrix->nnz)*sizeof(real), - stream); + hl_memcpy_async( + csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream); } else if (csc_val != NULL && csc_row != NULL && csc_col != NULL) { - hl_memcpy_async(csc->csc_val, - csc_val, - (csc_matrix->nnz)*sizeof(real), - stream); - hl_memcpy_async(csc->csc_row, - csc_row, - (csc_matrix->nnz)*sizeof(int), - stream); - hl_memcpy_async(csc->csc_col, - csc_col, - (csc_matrix->cols+1)*sizeof(int), - stream); + hl_memcpy_async( + csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream); + hl_memcpy_async( + csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream); + hl_memcpy_async( + csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); } else { LOG(FATAL) << "parameter csc_row or csc_col is null pointer!"; } } - csc->sparsity = ((float)csc_matrix->nnz) / - ((float)csc_matrix->rows) / + csc->sparsity = ((float)csc_matrix->nnz) / ((float)csc_matrix->rows) / ((float)csc_matrix->cols); } @@ -531,32 +482,23 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst, hl_sparse_matrix_s src, hl_stream_t stream) { CHECK(dst && src && dst->matrix && src->matrix) - << "parameter dst or src is null pointer!"; - CHECK_EQ(dst->format, src->format) - << "sparse matrix format does not match!"; + << "parameter dst or src is null pointer!"; + CHECK_EQ(dst->format, src->format) << "sparse matrix format does not match!"; CHECK(dst->type != HL_FLOAT_VALUE || src->type != HL_NO_VALUE) - << "src sparse matrix is no value, dst sparse matrix has value!"; + << "src sparse matrix is no value, dst sparse matrix has value!"; if (dst->format == HL_SPARSE_CSR) { dst->rows = src->rows; dst->cols = src->cols; - dst->nnz = src->nnz; + dst->nnz = src->nnz; hl_csr_matrix csr = (hl_csr_matrix)src->matrix; - hl_memcpy_csr_matrix(dst, - csr->csr_val, - csr->csr_row, - csr->csr_col, - stream); + hl_memcpy_csr_matrix(dst, csr->csr_val, csr->csr_row, csr->csr_col, stream); } else if (dst->format == HL_SPARSE_CSC) { dst->rows = src->rows; dst->cols = src->cols; - dst->nnz = src->nnz; + dst->nnz = src->nnz; hl_csc_matrix csc = (hl_csc_matrix)src->matrix; - hl_memcpy_csc_matrix(dst, - csc->csc_val, - csc->csc_row, - csc->csc_col, - stream); + hl_memcpy_csc_matrix(dst, csc->csc_val, csc->csc_row, csc->csc_col, stream); } else { LOG(FATAL) << "sparse matrix format error!"; } @@ -569,20 +511,24 @@ static void _beta_mul_c(real *c, int dimM, int dimN, real beta) { if (beta == 0.0) { hl_gpu_apply_unary_op(unary::Zero(), c, dimM, dimN, dimN); } else { - if (beta != 1.0){ - hl_gpu_apply_unary_op( - unary::mul_scalar(beta), c, dimM, dimN, dimN); + if (beta != 1.0) { + hl_gpu_apply_unary_op(unary::mul_scalar(beta), c, dimM, dimN, dimN); } } return; } -void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transb, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -592,7 +538,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) || (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) { - LOG(FATAL) << "parameter error!"; + LOG(FATAL) << "parameter error!"; } if (A_d->nnz == 0) { @@ -603,8 +549,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, /* nnz != 0 */ hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) || - A_d2->csr_row == NULL || - A_d2->csr_col == NULL) { + A_d2->csr_row == NULL || A_d2->csr_col == NULL) { LOG(FATAL) << "parameter error!"; } @@ -617,63 +562,63 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, /* sparsity pattern */ // A_d->sparsity; if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsrMulDense<0> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<0><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCsrMulDense<1> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<1><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (HPPL_OP_T == transa) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / - CU_CSC_MUL_DENSE_BLOCK_N; - int blocksY = (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / - CU_CSC_MUL_DENSE_BLOCK_K; + int blocksX = + (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N; + int blocksY = + (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K; dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y); dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCscMulDense<0> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<0><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCscMulDense<1> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<1><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transa error!"; @@ -682,11 +627,16 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_csr_mul_dense failed"); } -void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, - hl_sparse_matrix_s B_d, hl_trans_op_t transb, +void hl_matrix_dense_mul_csc(real *A_d, + hl_trans_op_t transa, + hl_sparse_matrix_s B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transa, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -698,8 +648,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, LOG(FATAL) << "parameter dims error!"; } - CHECK_EQ(B_d->format, HL_SPARSE_CSC) - << "matrix format error!"; + CHECK_EQ(B_d->format, HL_SPARSE_CSC) << "matrix format error!"; if (B_d->nnz == 0) { _beta_mul_c(C_d, dimM, dimN, beta); @@ -709,8 +658,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, /* nnz != 0 */ hl_csc_matrix B_d2 = (hl_csc_matrix)(B_d->matrix); if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) || - B_d2->csc_row == NULL || - B_d2->csc_col == NULL) { + B_d2->csc_row == NULL || B_d2->csc_col == NULL) { LOG(FATAL) << "parameter B is null!"; } @@ -721,60 +669,60 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsc<0> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_row, - B_d2->csc_col, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<0><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_row, + B_d2->csc_col, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsc<1> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_row, - B_d2->csc_col, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<1><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_row, + B_d2->csc_col, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (transb == HPPL_OP_T) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X; - int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M; + int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X; + int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M; dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y); dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsr<0> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_col, - B_d2->csc_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsr<0><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_col, + B_d2->csc_row, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsr<1> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_col, - B_d2->csc_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsr<1><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_col, + B_d2->csc_row, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transb error!"; @@ -783,24 +731,28 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_dense_mul_csc failed"); } -void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, - hl_sparse_matrix_s B_d, hl_trans_op_t transb, +void hl_matrix_dense_mul_csr(real *A_d, + hl_trans_op_t transa, + hl_sparse_matrix_s B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transa, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); - if (dimM <= 0 || dimN <= 0 || dimK <= 0 - || (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) - || (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) { + if (dimM <= 0 || dimN <= 0 || dimK <= 0 || + (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) || + (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) { LOG(FATAL) << "parameter dims error!"; } - CHECK_EQ(B_d->format, HL_SPARSE_CSR) - << "matrix format error!"; + CHECK_EQ(B_d->format, HL_SPARSE_CSR) << "matrix format error!"; if (B_d->nnz == 0) { _beta_mul_c(C_d, dimM, dimN, beta); @@ -810,41 +762,40 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, /* nnz != 0 */ hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix); if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) || - B_d2->csr_row == NULL || - B_d2->csr_col == NULL) { + B_d2->csr_row == NULL || B_d2->csr_col == NULL) { LOG(FATAL) << "parameter transa error!"; } if (transb == HPPL_OP_N) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X; - int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M; + int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X; + int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M; dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y); dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsr<0> - <<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_row, - B_d2->csr_col, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsr<0><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_row, + B_d2->csr_col, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsr<1> - <<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_row, - B_d2->csr_col, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsr<1><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_row, + B_d2->csr_col, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (transb == HPPL_OP_T) { int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST; @@ -852,29 +803,29 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST); dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsc<0> - <<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_col, - B_d2->csr_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<0><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_col, + B_d2->csr_row, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsc<1> - <<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_col, - B_d2->csr_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<1><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_col, + B_d2->csr_row, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transb error!"; @@ -883,11 +834,16 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_dense_mul_csr failed"); } -void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transb, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -908,42 +864,43 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, /* nnz != 0 */ hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix); if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) || - A_d2->csc_row == NULL || - A_d2->csc_col == NULL) { + A_d2->csc_row == NULL || A_d2->csc_col == NULL) { LOG(FATAL) << "parameter error!"; } if (HPPL_OP_N == transa) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N -1)/CU_CSC_MUL_DENSE_BLOCK_N; - int blocksY = (dimK + CU_CSC_MUL_DENSE_BLOCK_K -1)/CU_CSC_MUL_DENSE_BLOCK_K; + int blocksX = + (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N; + int blocksY = + (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K; dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y); dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCscMulDense<0> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<0><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCscMulDense<1> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<1><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (HPPL_OP_T == transa) { int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N; @@ -954,29 +911,29 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, /* sparsity pattern */ // A_d->sparsity; if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsrMulDense<0> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<0><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCsrMulDense<1> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<1><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transa error!"; @@ -985,11 +942,16 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_csc_mul_dense failed"); } -void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, - hl_sparse_matrix_s C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { +void hl_sparse_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, + hl_sparse_matrix_s C_d, + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); @@ -1000,18 +962,14 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, if (C_d->format == HL_SPARSE_CSC) { hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix); - if (C_d2->csc_val == NULL || - C_d2->csc_row == NULL || + if (C_d2->csc_val == NULL || C_d2->csc_row == NULL || C_d2->csc_col == NULL) { LOG(FATAL) << "parameter error!"; } if (beta != 1.0) { - hl_gpu_apply_unary_op(unary::mul_scalar(beta), - C_d2->csc_val, - 1, - C_d->nnz, - C_d->nnz); + hl_gpu_apply_unary_op( + unary::mul_scalar(beta), C_d2->csc_val, 1, C_d->nnz, C_d->nnz); } int blocksX = dimN; @@ -1020,34 +978,30 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, dim3 grid(blocksX, blocksY); bool transA = transa == HPPL_OP_T ? 1 : 0; bool transB = transb == HPPL_OP_T ? 1 : 0; - KeSMatrixDenseMulDense2CSC - <<>>(C_d2->csc_val, - C_d2->csc_row, - C_d2->csc_col, - A_d, - B_d, - transA, - transB, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulDense2CSC<<>>( + C_d2->csc_val, + C_d2->csc_row, + C_d2->csc_col, + A_d, + B_d, + transA, + transB, + dimM, + dimN, + dimK, + alpha, + beta); CHECK_SYNC("hl_sparse_matrix_mul failed"); } else { hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix); if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) || - C_d2->csr_row == NULL || - C_d2->csr_col == NULL) { + C_d2->csr_row == NULL || C_d2->csr_col == NULL) { LOG(FATAL) << "parameter error!"; } if (beta != 1.0) { - hl_gpu_apply_unary_op(unary::mul_scalar(beta), - C_d2->csr_val, - 1, - C_d->nnz, - C_d->nnz); + hl_gpu_apply_unary_op( + unary::mul_scalar(beta), C_d2->csr_val, 1, C_d->nnz, C_d->nnz); } bool transA = transa == HPPL_OP_T ? 1 : 0; @@ -1058,20 +1012,20 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1); dim3 grid(blocksX, blocksY); - KeSMatrixDenseMulDense2CSR - <<>>(C_d2->csr_val, - C_d2->csr_row, - C_d2->csr_col, - A_d, - B_d, - transA, - transB, - dimM, - dimN, - dimK, - alpha, - beta); - CHECK_SYNC("hl_sparse_matrix_mul failed"); + KeSMatrixDenseMulDense2CSR<<>>( + C_d2->csr_val, + C_d2->csr_row, + C_d2->csr_col, + A_d, + B_d, + transA, + transB, + dimM, + dimN, + dimK, + alpha, + beta); + CHECK_SYNC("hl_sparse_matrix_mul failed"); } else { CHECK(!transA) << "Not supported A is trans and B is not trans!"; @@ -1080,21 +1034,21 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1; int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE); dim3 grid(gridx, dimM); - KeSMatrixDenseMulDenseTrans2CSR - <<>>(C_d2->csr_val, - C_d2->csr_row, - C_d2->csr_col, - A_d, - B_d, - transA, - transB, - dimM, - dimN, - dimK, - alpha, - beta); - CHECK_SYNC("hl_sparse_matrix_mul failed"); - } + KeSMatrixDenseMulDenseTrans2CSR<<>>( + C_d2->csr_val, + C_d2->csr_row, + C_d2->csr_col, + A_d, + B_d, + transA, + transB, + dimM, + dimN, + dimK, + alpha, + beta); + CHECK_SYNC("hl_sparse_matrix_mul failed"); + } } } @@ -1111,7 +1065,7 @@ void hl_memcpy_from_csc_matrix(real *csc_val, CHECK_NOTNULL(csc_col); CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC) - << "csc_matrix is not csc format error!"; + << "csc_matrix is not csc format error!"; if (csc_matrix->nnz > row_size || csc_matrix->cols + 1 > static_cast(col_size)) { @@ -1119,20 +1073,20 @@ void hl_memcpy_from_csc_matrix(real *csc_val, } hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix); - hl_memcpy_async((void*)csc_row, - (void*)csc->csc_row, + hl_memcpy_async((void *)csc_row, + (void *)csc->csc_row, (csc_matrix->nnz) * sizeof(int), stream); - hl_memcpy_async((void*)csc_col, - (void*)csc->csc_col, + hl_memcpy_async((void *)csc_col, + (void *)csc->csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); if (csc_matrix->type == HL_FLOAT_VALUE) { if (csc_val != NULL) { CHECK_LE(csc_matrix->nnz, val_size) << "size not match!"; - hl_memcpy_async((void*)csc_val, - (void*)csc->csc_val, - (csc_matrix->nnz)*sizeof(real), + hl_memcpy_async((void *)csc_val, + (void *)csc->csc_val, + (csc_matrix->nnz) * sizeof(real), stream); } else { LOG(FATAL) << "parameter csr_val is null pointer!"; @@ -1152,7 +1106,7 @@ void hl_memcpy_from_csr_matrix(real *csr_val, CHECK_NOTNULL(csr_row); CHECK_NOTNULL(csr_col); CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR) - << "csr_matrix is not csr format error!"; + << "csr_matrix is not csr format error!"; if (csr_matrix->nnz > col_size || csr_matrix->rows + 1 > static_cast(row_size)) { @@ -1160,20 +1114,20 @@ void hl_memcpy_from_csr_matrix(real *csr_val, } hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix); - hl_memcpy_async((void*)csr_row, - (void*)csr->csr_row, - (csr_matrix->rows+1)*sizeof(int), + hl_memcpy_async((void *)csr_row, + (void *)csr->csr_row, + (csr_matrix->rows + 1) * sizeof(int), stream); - hl_memcpy_async((void*)csr_col, - (void*)csr->csr_col, - (csr_matrix->nnz)*sizeof(int), + hl_memcpy_async((void *)csr_col, + (void *)csr->csr_col, + (csr_matrix->nnz) * sizeof(int), stream); if (csr_matrix->type == HL_FLOAT_VALUE) { if (csr_val != NULL) { CHECK_LE(csr_matrix->nnz, val_size) << "size not match!"; - hl_memcpy_async((void*)csr_val, - (void*)csr->csr_val, - (csr_matrix->nnz)*sizeof(real), + hl_memcpy_async((void *)csr_val, + (void *)csr->csr_val, + (csr_matrix->nnz) * sizeof(real), stream); } else { LOG(FATAL) << "parameter csr_val is null pointer!"; @@ -1181,8 +1135,8 @@ void hl_memcpy_from_csr_matrix(real *csr_val, } } -void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM, - int dimN, real scale) { +void hl_sparse_matrix_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) { if (B_d->format == HL_SPARSE_CSR) { hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale); } else { @@ -1190,8 +1144,8 @@ void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM, } } -void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d, - int dimM, int dimN, real scale) { +void hl_matrix_csr_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -1216,8 +1170,7 @@ void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d, CHECK_SYNC("hl_matrix_csr_column_sum failed"); } -void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, - real* B_d, real scale) { +void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) { if (A_d->format == HL_SPARSE_CSR) { hl_matrix_csr_add_bias(A_d, B_d, scale); } else { @@ -1225,8 +1178,7 @@ void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, } } -void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d, - real scale) { +void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -1247,8 +1199,12 @@ void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d, CHECK_SYNC("hl_sparse_matrix_add_bias failed"); } -void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM, - int dimN, real alpha, real beta) { +void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, + real *B_d, + int dimM, + int dimN, + real alpha, + real beta) { if (A_d->format == HL_SPARSE_CSR) { hl_matrix_csr_add_dense(A_d, B_d, dimM, dimN, alpha, beta); } else { @@ -1256,8 +1212,12 @@ void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM, } } -void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM, - int dimN, real alpha, real beta) { +void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, + real *B_d, + int dimM, + int dimN, + real alpha, + real beta) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -1277,20 +1237,26 @@ void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM, gridX = gridX > 0 ? gridX : 1; dim3 block(512, 1); dim3 grid(gridX, dimM); - KeSMatrixCsrAddDense<<>>( - A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, B_d, alpha, beta, dimM, dimN); + KeSMatrixCsrAddDense<<>>(A_d2->csr_val, + A_d2->csr_row, + A_d2->csr_col, + B_d, + alpha, + beta, + dimM, + dimN); CHECK_SYNC("hl_sparse_matrix_add_dense failed"); } -int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { +int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { __sparse_get_return__(sMat, row); } -int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { +int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { __sparse_get_return__(sMat, col); } -real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) { +real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) { __sparse_get_return__(sMat, val); } diff --git a/paddle/cuda/src/hl_perturbation_util.cu b/paddle/cuda/src/hl_perturbation_util.cu index 2a945bcdb87fe49c121890128ef77b084ebe8e60..d01a91561efa2ebe8e0cabc2b4e8885f2c02ab48 100644 --- a/paddle/cuda/src/hl_perturbation_util.cu +++ b/paddle/cuda/src/hl_perturbation_util.cu @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#include #include -#include "hl_cuda.h" -#include "hl_time.h" +#include #include "hl_base.h" +#include "hl_cuda.h" #include "hl_perturbation_util.cuh" +#include "hl_time.h" #define _USE_MATH_DEFINES @@ -30,10 +29,16 @@ limitations under the License. */ * centerX, centerY: translation. * sourceX, sourceY: output coordinates in the original image. */ -__device__ void getTranformCoord(int x, int y, real theta, real scale, - real tgtCenter, real imgCenter, - real centerR, real centerC, - int* sourceX, int* sourceY) { +__device__ void getTranformCoord(int x, + int y, + real theta, + real scale, + real tgtCenter, + real imgCenter, + real centerR, + real centerC, + int* sourceX, + int* sourceY) { real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)}; // compute coornidates in the rotated and scaled image @@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale, * created by Wei Xu (genome), converted by Jiang Wang */ -__global__ void kSamplingPatches(const real* imgs, real* targets, - int imgSize, int tgtSize, const int channels, - int samplingRate, const real* thetas, - const real* scales, const int* centerRs, - const int* centerCs, const real padValue, +__global__ void kSamplingPatches(const real* imgs, + real* targets, + int imgSize, + int tgtSize, + const int channels, + int samplingRate, + const real* thetas, + const real* scales, + const int* centerRs, + const int* centerCs, + const real padValue, const int numImages) { const int caseIdx = blockIdx.x * 4 + threadIdx.x; const int pxIdx = blockIdx.y * 128 + threadIdx.y; @@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets, const int pxY = pxIdx / tgtSize; int srcPxX, srcPxY; - getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter, - imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX, + getTranformCoord(pxX, + pxY, + thetas[imgIdx], + scales[imgIdx], + tgtCenter, + imgCenter, + centerCs[caseIdx], + centerRs[caseIdx], + &srcPxX, &srcPxY); imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels; @@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets, * * created by Wei Xu */ -void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, - int*& gpuCenterR, int*& gpuCenterC, - int numImages, int imgSize, real rotateAngle, - real scaleRatio, int samplingRate, +void hl_generate_disturb_params(real*& gpuAngle, + real*& gpuScaleRatio, + int*& gpuCenterR, + int*& gpuCenterC, + int numImages, + int imgSize, + real rotateAngle, + real scaleRatio, + int samplingRate, bool isTrain) { // The number of output samples. int numPatches = numImages * samplingRate; @@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, for (int i = 0; i < numImages; i++) { r_angle[i] = (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0) // NOLINT - - 0.5); + - + 0.5); s_ratio[i] = 1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio; // NOLINT } @@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, int pxY = (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0)); // NOLINT - const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]), - sin(-r_angle[i]), cos(-r_angle[i])}; + const real H[4] = {cos(-r_angle[i]), + -sin(-r_angle[i]), + sin(-r_angle[i]), + cos(-r_angle[i])}; real x = pxX - imgCenter; real y = pxY - imgCenter; real xx = H[0] * x + H[1] * y; @@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, delete[] center_c; } -void hl_conv_random_disturb_with_params(const real* images, int imgSize, - int tgtSize, int channels, - int numImages, int samplingRate, +void hl_conv_random_disturb_with_params(const real* images, + int imgSize, + int tgtSize, + int channels, + int numImages, + int samplingRate, const real* gpuRotationAngle, const real* gpuScaleRatio, const int* gpuCenterR, @@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize, dim3 threadsPerBlock(4, 128); dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128)); - kSamplingPatches <<>> - (images, target, imgSize, tgtSize, channels, samplingRate, - gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC, - paddingValue, numImages); + kSamplingPatches<<>>(images, + target, + imgSize, + tgtSize, + channels, + samplingRate, + gpuRotationAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + paddingValue, + numImages); hl_device_synchronize(); } -void hl_conv_random_disturb(const real* images, int imgSize, - int tgtSize, int channels, int numImages, - real scaleRatio, real rotateAngle, - int samplingRate, real* gpu_r_angle, - real* gpu_s_ratio, int* gpu_center_r, - int* gpu_center_c, int paddingValue, - bool isTrain, real* targets) { +void hl_conv_random_disturb(const real* images, + int imgSize, + int tgtSize, + int channels, + int numImages, + real scaleRatio, + real rotateAngle, + int samplingRate, + real* gpu_r_angle, + real* gpu_s_ratio, + int* gpu_center_r, + int* gpu_center_c, + int paddingValue, + bool isTrain, + real* targets) { // generate the random disturbance sequence and the sampling locations - hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r, - gpu_center_c, numImages, imgSize, rotateAngle, - scaleRatio, samplingRate, isTrain); - - hl_conv_random_disturb_with_params( - images, imgSize, tgtSize, channels, numImages, - samplingRate, gpu_r_angle, gpu_s_ratio, - gpu_center_r, gpu_center_r, paddingValue, - targets); + hl_generate_disturb_params(gpu_r_angle, + gpu_s_ratio, + gpu_center_r, + gpu_center_c, + numImages, + imgSize, + rotateAngle, + scaleRatio, + samplingRate, + isTrain); + + hl_conv_random_disturb_with_params(images, + imgSize, + tgtSize, + channels, + numImages, + samplingRate, + gpu_r_angle, + gpu_s_ratio, + gpu_center_r, + gpu_center_r, + paddingValue, + targets); } diff --git a/paddle/cuda/src/hl_table_apply.cu b/paddle/cuda/src/hl_table_apply.cu index 61edbe3ccc7028fd8779c4119f33c4cb5afe0564..d3b71c75e6e69d48c8d98041e3d6075aa8d53610 100644 --- a/paddle/cuda/src/hl_table_apply.cu +++ b/paddle/cuda/src/hl_table_apply.cu @@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "hl_base.h" -#include "hl_device_functions.cuh" #include "hl_cuda.h" +#include "hl_device_functions.cuh" #include "paddle/utils/Logging.h" -template -__global__ void KeMatrixAddRows(real* output, int ldo, - real* table, int ldt, +template +__global__ void KeMatrixAddRows(real* output, + int ldo, + real* table, + int ldt, int* ids, int numSamples, int tableSize, @@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo, while (idy < numSamples) { int tableId = ids[idy]; if ((0 <= tableId) && (tableId < tableSize)) { - real *out = output + idy * ldo; - real *tab = table + tableId * ldt; + real* out = output + idy * ldo; + real* tab = table + tableId * ldt; for (int i = idx; i < dim; i += blockDimX) { if (AddRow) { paddle::paddleAtomicAdd(&tab[i], out[i]); @@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo, } } -void hl_matrix_select_rows(real* output, int ldo, - real* table, int ldt, +void hl_matrix_select_rows(real* output, + int ldo, + real* table, + int ldt, int* ids, int numSamples, int tableSize, @@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo, dim3 threads(128, 8); dim3 grid(8, 1); - KeMatrixAddRows<128, 8, 8, 0><<< grid, threads, 0, STREAM_DEFAULT >>> - (output, ldo, table, ldt, ids, numSamples, tableSize, dim); + KeMatrixAddRows<128, 8, 8, 0><<>>( + output, ldo, table, ldt, ids, numSamples, tableSize, dim); CHECK_SYNC("hl_matrix_select_rows failed"); } -void hl_matrix_add_to_rows(real* table, int ldt, - real* input, int ldi, +void hl_matrix_add_to_rows(real* table, + int ldt, + real* input, + int ldi, int* ids, int numSamples, int tableSize, @@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt, dim3 threads(128, 8); dim3 grid(8, 1); - KeMatrixAddRows<128, 8, 8, 1><<< grid, threads, 0, STREAM_DEFAULT >>> - (input, ldi, table, ldt, ids, numSamples, tableSize, dim); + KeMatrixAddRows<128, 8, 8, 1><<>>( + input, ldi, table, ldt, ids, numSamples, tableSize, dim); CHECK_SYNC("hl_matrix_add_to_rows failed"); } -template -__global__ void KeVectorSelect(T* dst, int sized, - const T* src, int sizes, - const int* ids, int sizei) { +template +__global__ void KeVectorSelect( + T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) { int idx = threadIdx.x + blockDimX * blockIdx.x; while (idx < sizei) { int index = ids[idx]; @@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized, } template -void hl_vector_select_from(T* dst, int sized, - const T* src, int sizes, - const int* ids, int sizei) { +void hl_vector_select_from( + T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) { CHECK_NOTNULL(dst); CHECK_NOTNULL(src); CHECK_NOTNULL(ids); @@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized, dim3 threads(512, 1); dim3 grid(8, 1); - KeVectorSelect<<< grid, threads, 0, STREAM_DEFAULT >>> - (dst, sized, src, sizes, ids, sizei); + KeVectorSelect<<>>( + dst, sized, src, sizes, ids, sizei); CHECK_SYNC("hl_vector_select_from failed"); } -template -void hl_vector_select_from(real* dst, int sized, - const real* src, int sizes, - const int* ids, int sizei); -template -void hl_vector_select_from(int* dst, int sized, - const int* src, int sizes, - const int* ids, int sizei); - +template void hl_vector_select_from(real* dst, + int sized, + const real* src, + int sizes, + const int* ids, + int sizei); +template void hl_vector_select_from( + int* dst, int sized, const int* src, int sizes, const int* ids, int sizei); diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu index 4f0bbfcf4e3aa51dd06acf254af65c62098a1df7..1896a56634c3a75e5a2a1e08661088b263f8ee10 100644 --- a/paddle/cuda/src/hl_top_k.cu +++ b/paddle/cuda/src/hl_top_k.cu @@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "hl_base.h" -#include "hl_top_k.h" #include "hl_sparse.ph" +#include "hl_top_k.h" #include "paddle/utils/Logging.h" // using namespace hppl; struct Pair { - __device__ __forceinline__ - Pair() {} + __device__ __forceinline__ Pair() {} - __device__ __forceinline__ - Pair(real value, int id) : v_(value), id_(id) {} + __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {} - __device__ __forceinline__ - void set(real value, int id) { + __device__ __forceinline__ void set(real value, int id) { v_ = value; id_ = id; } - __device__ __forceinline__ - void operator=(const Pair& in) { + __device__ __forceinline__ void operator=(const Pair& in) { v_ = in.v_; id_ = in.id_; } - __device__ __forceinline__ - bool operator<(const real value) const { + __device__ __forceinline__ bool operator<(const real value) const { return (v_ < value); } - __device__ __forceinline__ - bool operator<(const Pair& in) const { + __device__ __forceinline__ bool operator<(const Pair& in) const { return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_)); } - __device__ __forceinline__ - bool operator>(const Pair& in) const { + __device__ __forceinline__ bool operator>(const Pair& in) const { return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_)); } @@ -58,8 +50,9 @@ struct Pair { int id_; }; -__device__ __forceinline__ -void addTo(Pair topK[], const Pair &p, int beamSize) { +__device__ __forceinline__ void addTo(Pair topK[], + const Pair& p, + int beamSize) { for (int k = beamSize - 2; k >= 0; k--) { if (topK[k] < p) { topK[k + 1] = topK[k]; @@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) { topK[0] = p; } -template -__device__ __forceinline__ -void addTo(Pair topK[], const Pair &p) { +template +__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) { for (int k = beamSize - 2; k >= 0; k--) { if (topK[k] < p) { topK[k + 1] = topK[k]; @@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) { topK[0] = p; } -template -__device__ __forceinline__ -void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) { +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* src, int idx, int dim, int beamSize) { while (idx < dim) { if (topK[beamSize - 1] < src[idx]) { Pair tmp(src[idx], idx); @@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) { } } -template -__device__ __forceinline__ -void getTopK(Pair topK[], real *src, int idx, int dim, - const Pair& max, int beamSize) { +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) { while (idx < dim) { if (topK[beamSize - 1] < src[idx]) { Pair tmp(src[idx], idx); @@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, } } -template -__device__ __forceinline__ -void getTopK(Pair topK[], real *val, int *col, - int idx, int dim, int beamSize) { +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* val, int* col, int idx, int dim, int beamSize) { while (idx < dim) { if (topK[beamSize - 1] < val[idx]) { Pair tmp(val[idx], col[idx]); @@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col, } } -template -__device__ __forceinline__ -void getTopK(Pair topK[], real *val, int *col, int idx, int dim, - const Pair& max, int beamSize) { +template +__device__ __forceinline__ void getTopK(Pair topK[], + real* val, + int* col, + int idx, + int dim, + const Pair& max, + int beamSize) { while (idx < dim) { if (topK[beamSize - 1] < val[idx]) { Pair tmp(val[idx], col[idx]); @@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim, } } -template -__device__ __forceinline__ -void threadGetTopK(Pair topK[], int& beam, int beamSize, - real* src, - bool& firstStep, bool& isEmpty, Pair& max, - int dim, const int tid) { +template +__device__ __forceinline__ void threadGetTopK(Pair topK[], + int& beam, + int beamSize, + real* src, + bool& firstStep, + bool& isEmpty, + Pair& max, + int dim, + const int tid) { if (beam > 0) { int length = beam < beamSize ? beam : beamSize; if (firstStep) { @@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } if (!isEmpty) { - getTopK(topK + maxLength - beam, src, tid, dim, - max, length); + getTopK(topK + maxLength - beam, src, tid, dim, max, length); } } @@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } -template -__device__ __forceinline__ -void threadGetTopK(Pair topK[], int& beam, int beamSize, - real* val, int* col, - bool& firstStep, bool& isEmpty, Pair& max, - int dim, const int tid) { +template +__device__ __forceinline__ void threadGetTopK(Pair topK[], + int& beam, + int beamSize, + real* val, + int* col, + bool& firstStep, + bool& isEmpty, + Pair& max, + int dim, + const int tid) { if (beam > 0) { int length = beam < beamSize ? beam : beamSize; if (firstStep) { @@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } if (!isEmpty) { - getTopK(topK + maxLength - beam, val, col, tid, dim, - max, length); + getTopK( + topK + maxLength - beam, val, col, tid, dim, max, length); } } @@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } -template -__device__ __forceinline__ -void blockReduce(Pair* shTopK, int* maxId, Pair topK[], - real** topVal, int** topIds, - int& beam, int& beamSize, - const int tid, const int warp) { +template +__device__ __forceinline__ void blockReduce(Pair* shTopK, + int* maxId, + Pair topK[], + real** topVal, + int** topIds, + int& beam, + int& beamSize, + const int tid, + const int warp) { while (true) { __syncthreads(); if (tid < blockSize / 2) { @@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[], } } __syncthreads(); - for (int stride = blockSize / 4; stride > 0; stride = stride/2) { + for (int stride = blockSize / 4; stride > 0; stride = stride / 2) { if (tid < stride) { if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) { maxId[tid] = maxId[tid + stride]; @@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[], * 3. go to the second setp, until one thread's topK value is null; * 4. go to the first setp, until get the topK value. */ -template -__global__ void KeMatrixTopK(real* topVal, int ldv, - int * topIds, - real* src, int lds, +template +__global__ void KeMatrixTopK(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, int dim, int beamSize) { __shared__ Pair shTopK[blockSize]; @@ -271,7 +279,7 @@ __global__ void KeMatrixTopK(real* topVal, int ldv, topVal += blockIdx.x * ldv; topIds += blockIdx.x * beamSize; - Pair topK[maxLength]; // NOLINT + Pair topK[maxLength]; // NOLINT int beam = maxLength; Pair max; bool isEmpty = false; @@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv, topK[k].set(-HL_FLOAT_MAX, -1); } while (beamSize) { - threadGetTopK - (topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); + threadGetTopK( + topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); shTopK[tid] = topK[0]; - blockReduce - (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); } } -template -__global__ void KeSMatrixTopK(real* topVal, int ldv, - int * topIds, +template +__global__ void KeSMatrixTopK(real* topVal, + int ldv, + int* topIds, real* val, int* row, int* col, @@ -304,7 +313,7 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv, topVal += blockIdx.x * ldv; topIds += blockIdx.x * beamSize; - Pair topK[maxLength]; // NOLINT + Pair topK[maxLength]; // NOLINT int beam = maxLength; Pair max; bool isEmpty = false; @@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv, topK[k].set(-HL_FLOAT_MAX, -1); } while (beamSize) { - threadGetTopK - (topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid); + threadGetTopK( + topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid); shTopK[tid] = topK[0]; - blockReduce - (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); } } -void hl_matrix_top_k(real* topVal, int ldv, - int * topIds, - real* src, int lds, +void hl_matrix_top_k(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, int dim, int beamSize, int numSamples) { @@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv, dim3 threads(256, 1); dim3 grid(numSamples, 1); - KeMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>> - (topVal, ldv, topIds, src, lds, dim, beamSize); + KeMatrixTopK<5, 256><<>>( + topVal, ldv, topIds, src, lds, dim, beamSize); CHECK_SYNC("hl_matrix_top_k failed"); } -void hl_sparse_matrix_top_k(real* topVal, int ldv, - int * topIds, +void hl_sparse_matrix_top_k(real* topVal, + int ldv, + int* topIds, hl_sparse_matrix_s src, int beamSize, int numSamples) { CHECK_NOTNULL(topVal); CHECK_NOTNULL(topIds); CHECK_NOTNULL(src); - CHECK_EQ(src->format, HL_SPARSE_CSR) - <<"sparse matrix format error!"; + CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!"; hl_csr_matrix csr = (hl_csr_matrix)src->matrix; - if (csr->csr_val == NULL || csr->csr_row == NULL || - csr->csr_col == NULL) { + if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) { LOG(FATAL) << "parameter src is null!"; } dim3 threads(256, 1); dim3 grid(numSamples, 1); - KeSMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>> - (topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize); + KeSMatrixTopK<5, 256><<>>( + topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize); CHECK_SYNC("hl_sparse_matrix_top_k failed"); } @@ -392,10 +402,12 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv, * 3. go to the second setp, until one thread's topK value is null; * 4. go to the first setp, until get the topK value. */ -template -__global__ void KeMatrixTopKClassificationError(real* topVal, int ldv, - int * topIds, - real* src, int lds, +template +__global__ void KeMatrixTopKClassificationError(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, int dim, int beamSize, int* label, @@ -408,7 +420,7 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv, topVal += blockIdx.x * ldv; topIds += blockIdx.x * beamSize; - Pair topK[maxLength]; // NOLINT + Pair topK[maxLength]; // NOLINT int beam = maxLength; Pair max; bool isEmpty = false; @@ -420,34 +432,36 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv, } while (beamSize) { - threadGetTopK - (topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); + threadGetTopK( + topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); shTopK[tid] = topK[0]; - blockReduce - (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); } __syncthreads(); if (tid == 0) { for (int i = 0; i < topkSize; i++) { - if (*--topIds == label[blockIdx.x]) { - recResult[blockIdx.x] = 0; - break; - } - recResult[blockIdx.x] = 1.0f; + if (*--topIds == label[blockIdx.x]) { + recResult[blockIdx.x] = 0; + break; + } + recResult[blockIdx.x] = 1.0f; } } } -void hl_matrix_classification_error(real* topVal, int ldv, - int* topIds, - real* src, int lds, - int dim, - int topkSize, - int numSamples, - int* label, - real* recResult) { +void hl_matrix_classification_error(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, + int dim, + int topkSize, + int numSamples, + int* label, + real* recResult) { CHECK_NOTNULL(topVal); CHECK_NOTNULL(topIds); CHECK_NOTNULL(src); @@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv, dim3 threads(256, 1); dim3 grid(numSamples, 1); - KeMatrixTopKClassificationError<5, 256> - <<< grid, threads, 0, STREAM_DEFAULT >>> - (topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult); + KeMatrixTopKClassificationError<5, 256><<>>( + topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult); CHECK_SYNC("hl_matrix_top_k classification error failed"); } diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 21cb7c7265e0052630b68954fa25f9189e641e7b..1db042c6fc8b6c4ea7c3854ea4b1cd016deeb0b6 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -8,15 +8,19 @@ cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) cc_test(variable_test SRCS variable_test.cc) -cc_test(scope_test SRCS scope_test.cc) -proto_library(attr_type SRCS attr_type.proto) -proto_library(op_proto SRCS op_proto.proto DEPS attr_type) -proto_library(op_desc SRCS op_desc.proto DEPS attr_type) +cc_library(scope SRCS scope.cc) +cc_test(scope_test SRCS scope_test.cc DEPS scope) + +proto_library(attribute_proto SRCS attribute.proto) +proto_library(op_proto SRCS op_proto.proto DEPS attribute_proto) +proto_library(op_desc SRCS op_desc.proto DEPS attribute_proto) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) -cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor) +cc_library(attribute SRCS attribute.cc DEPS op_desc op_proto) + +cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope attribute) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator) @@ -24,10 +28,19 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_builder) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op) -py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) +py_proto_compile(framework_py_proto SRCS attribute.proto op_proto.proto op_desc.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) -cc_library(net SRCS net.cc DEPS op_registry) -cc_test(net_op_test SRCS net_op_test.cc DEPS net add_op mul_op sigmoid_op softmax_op fc_op) +cc_library(backward SRCS backward.cc DEPS net_op) +cc_test(backward_test SRCS backward_test.cc DEPS backward) +cc_library(paddle_pybind SHARED + SRCS pybind.cc + DEPS pybind python backward + fc_op + sgd_op + add_op + mean_op + cross_entropy_op + recurrent_op) diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc new file mode 100644 index 0000000000000000000000000000000000000000..4c5790693b7e48396e945d09f4fdc72b86aa5978 --- /dev/null +++ b/paddle/framework/attribute.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/attribute.h" + +#include + +namespace paddle { +namespace framework { + +template <> +AttrType AttrTypeID() { + return INT; +} +template <> +AttrType AttrTypeID() { + return FLOAT; +} +template <> +AttrType AttrTypeID() { + return STRING; +} +template <> +AttrType AttrTypeID>() { + return INTS; +} +template <> +AttrType AttrTypeID>() { + return FLOATS; +} +template <> +AttrType AttrTypeID>() { + return STRINGS; +} + +Attribute GetAttrValue(const AttrDesc& attr_desc) { + switch (attr_desc.type()) { + case paddle::framework::AttrType::INT: { + return attr_desc.i(); + } + case paddle::framework::AttrType::FLOAT: { + return attr_desc.f(); + } + case paddle::framework::AttrType::STRING: { + return attr_desc.s(); + } + case paddle::framework::AttrType::INTS: { + std::vector val(attr_desc.ints_size()); + for (int i = 0; i < attr_desc.ints_size(); ++i) { + val[i] = attr_desc.ints(i); + } + return val; + } + case paddle::framework::AttrType::FLOATS: { + std::vector val(attr_desc.floats_size()); + for (int i = 0; i < attr_desc.floats_size(); ++i) { + val[i] = attr_desc.floats(i); + } + return val; + } + case paddle::framework::AttrType::STRINGS: { + std::vector val(attr_desc.strings_size()); + for (int i = 0; i < attr_desc.strings_size(); ++i) { + val[i] = attr_desc.strings(i); + } + return val; + } + } + PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !"); + return boost::blank(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/attr_checker.h b/paddle/framework/attribute.h similarity index 79% rename from paddle/framework/attr_checker.h rename to paddle/framework/attribute.h index ea5614a45f3a77a851358aff80abbc276c9972ba..3a5820e9c60539e3c771df5da4e82f6c1cae688f 100644 --- a/paddle/framework/attr_checker.h +++ b/paddle/framework/attribute.h @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #pragma once #include @@ -6,6 +20,9 @@ #include #include #include + +#include "paddle/framework/attribute.pb.h" +#include "paddle/framework/op_desc.pb.h" #include "paddle/platform/enforce.h" namespace paddle { @@ -14,13 +31,19 @@ namespace framework { typedef boost::variant, std::vector, std::vector> Attribute; + typedef std::unordered_map AttributeMap; +template +AttrType AttrTypeID(); + +Attribute GetAttrValue(const AttrDesc& attr_desc); + // check whether a value(attribute) fit a certain limit template class LargerThanChecker { public: - LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {} + explicit LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {} void operator()(T& value) const { PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail"); } @@ -35,7 +58,8 @@ class LargerThanChecker { template class DefaultValueSetter { public: - DefaultValueSetter(T default_value) : default_value_(default_value) {} + explicit DefaultValueSetter(T default_value) + : default_value_(default_value) {} void operator()(T& value) const { value = default_value_; } private: @@ -78,7 +102,8 @@ class TypedAttrChecker { typedef std::function ValueChecker; public: - TypedAttrChecker(const std::string& attr_name) : attr_name_(attr_name) {} + explicit TypedAttrChecker(const std::string& attr_name) + : attr_name_(attr_name) {} TypedAttrChecker& InEnum(const std::unordered_set& range) { value_checkers_.push_back(EnumInContainer(range)); diff --git a/paddle/framework/attr_type.proto b/paddle/framework/attribute.proto similarity index 88% rename from paddle/framework/attr_type.proto rename to paddle/framework/attribute.proto index 2d8e0476d710b7ba987d085d828ca13a4ee23707..13ae312c10e934566384b8bd0f41dacd6c01fc2f 100644 --- a/paddle/framework/attr_type.proto +++ b/paddle/framework/attribute.proto @@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -syntax="proto2"; +syntax = "proto2"; package paddle.framework; // Attribute Type for paddle's Op. // Op contains many attributes. Each type of attributes could be different. // The AttrType will be shared between AttrDesc and AttrProto. enum AttrType { - INT = 0; - FLOAT = 1; - STRING = 2; - INTS = 3; - FLOATS = 4; - STRINGS = 5; + INT = 0; + FLOAT = 1; + STRING = 2; + INTS = 3; + FLOATS = 4; + STRINGS = 5; } \ No newline at end of file diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc new file mode 100644 index 0000000000000000000000000000000000000000..13706f8b562a1d68fe0d603f51c2fb47b4e18164 --- /dev/null +++ b/paddle/framework/backward.cc @@ -0,0 +1,179 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/backward.h" +#include +#include "paddle/framework/op_registry.h" +#include "paddle/operators/net_op.h" + +namespace paddle { +namespace framework { + +static bool AllInSet(const std::vector& names, + const std::string& suffix, + const std::unordered_set& set) { + for (auto& name : names) { + if (set.find(name + suffix) == set.end()) { + return false; + } + } + return true; +} + +static std::shared_ptr NOP() { + auto net_op = std::make_shared(); + net_op->type_ = "@NOP@"; + net_op->CompleteAddOp(); + return net_op; +} + +// Get backward operator from a forward operator, recursively implementation. +// +// no_grad_names the gradient variable names without gradient calculating. +// +// uniq_id is a unique index used inside recursively calling +// BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and +// pass `uniq_id` through recursive calling. +// +// returns The backward operator. For simple situation, it is a simple +// operator. For complex situation, it is a NetOp. +// +// See Backward.h for details +static std::shared_ptr BackwardRecursive( + const OperatorBase& forwardOp, + std::unordered_set& no_grad_names, size_t& uniq_id); +std::shared_ptr BackwardRecursive( + const OperatorBase& forwardOp, + std::unordered_set& no_grad_names, size_t& uniq_id) { + // If all input gradients of forwarding operator do not need to calculate, + // just return an NOP. Not return null ptr because NOP does not take + // too much time for calculation, but it is useful for simplifying logic. + if (AllInSet(forwardOp.inputs_, kGradVarSuffix, no_grad_names)) { + return NOP(); + } + + // All output gradients of forwarding operator do not need to calculate. + // Then all input gradients cannot be computed at all, and we put them into + // `no_grad_names` set. Return an NOP. + if (AllInSet(forwardOp.outputs_, kGradVarSuffix, no_grad_names)) { + for (auto& name : forwardOp.inputs_) { + // Mark all input is not need + no_grad_names.insert(name + kGradVarSuffix); + } + return NOP(); + } + + // Returned gradient network + auto net = std::make_shared(); + + if (forwardOp.IsNetOp()) { + // Because forwardOp is a net op, it can static_cast. + auto& forwardNet = static_cast(forwardOp); + + // Map from output gradient variable name to operator's indices in + // backward net. That operator generates that variable. + std::unordered_map> dup_output_ops; + + size_t local_op_id = 0; + // reversely travel forwardNet + for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend(); + ++it, ++local_op_id) { + auto fwd = *it; + auto bwd = BackwardRecursive(*fwd, no_grad_names, uniq_id); + net->AddOp(bwd); + for (auto& out : bwd->outputs_) { + dup_output_ops[out].emplace_back(local_op_id); + } + } + // Get unique ID for this method. + auto uid = uniq_id++; + // TODO(dzh): more comment + using Pos = std::pair>; + std::list insert_position; + for (auto& dup_output_op : dup_output_ops) { + const std::string& name = dup_output_op.first; + auto& dup_op = dup_output_op.second; + if (dup_op.size() == 1) continue; + std::vector dup_outputs; + + for (size_t i = 0; i < dup_op.size(); ++i) { + auto op_offset = dup_op[i]; + dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" + + std::to_string(i)); + net->ops_[op_offset]->Rename(name, dup_outputs.back()); + } + insert_position.push_back( + {dup_op.back(), + OpRegistry::CreateOp( + "add", {dup_outputs}, {name}, + {{"input_format", + std::vector{0, static_cast(dup_outputs.size())}}})}); + } + + insert_position.sort( + [](const Pos& l, const Pos& r) { return l.first > r.first; }); + + for (auto& pos : insert_position) { + net->InsertOp(pos.first + 1, pos.second); + } + + } else { + std::shared_ptr grad_op = OpRegistry::CreateGradOp(forwardOp); + for (std::string& grad_input : grad_op->inputs_) { + if (no_grad_names.count(grad_input)) { + std::string prefix = + grad_input.substr(0, grad_input.size() - kGradVarSuffix.size()); + grad_input = prefix + kZeroVarSuffix; + + // If part of input gradient of that operator is not calculated, fill + // zero variables to that input gradient. + net->AddOp(OpRegistry::CreateOp("fill_zeros_like", {prefix}, + {grad_input}, {})); + } + } + + for (std::string& grad_output : grad_op->outputs_) { + if (no_grad_names.count(grad_output)) { + grad_output = kEmptyVarName; + } + } + + if (net->ops_.empty()) { // Current no aux op is added to network + return grad_op; + } + net->AddOp(grad_op); + } + net->type_ = "@GENERATED_BACKWARD@"; + net->CompleteAddOp(); + return net; +} + +// See header for comments +std::shared_ptr Backward( + const OperatorBase& forwardOp, + const std::unordered_set& no_grad_vars) { + std::unordered_set no_grad_names; + no_grad_names.reserve(no_grad_vars.size()); + + no_grad_names.insert(kEmptyVarName + kGradVarSuffix); + + for (auto& name : no_grad_vars) { + no_grad_names.insert(name + kGradVarSuffix); + } + size_t uid = 0; + return BackwardRecursive(forwardOp, no_grad_names, uid); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/backward.h b/paddle/framework/backward.h new file mode 100644 index 0000000000000000000000000000000000000000..c181919dc165cf0b49362f85e22ceb4131bbd387 --- /dev/null +++ b/paddle/framework/backward.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include "operator.h" +namespace paddle { +namespace framework { + +// Create the backward operator from a forward operator. +// TODO(yuyang18): Add more API reference comment. +extern std::shared_ptr Backward( + const OperatorBase& forwardOp, + const std::unordered_set& no_grad_vars); +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/backward.md b/paddle/framework/backward.md new file mode 100644 index 0000000000000000000000000000000000000000..74c001b06a9e7b2279abf998604f2acf1b1168e4 --- /dev/null +++ b/paddle/framework/backward.md @@ -0,0 +1,38 @@ +## Operator/expression 's Backward + +### Motivation + +In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the fundmental gradient operators/expressions together with chain rule . Every forward network need a backward network to construct the full computation lineage, the operator/ expression's Backward feature will generate the backward pass respect to forward pass. + +### Implement : gradient operator registry + +| | forward operator | backward operator | +| ---------------------- | ---------------- | -------------------------------- | +| **Operator::inputs_** | Inputs | Inputs, Outputs, OutputGradients | +| **Operator::outputs_** | Outputs | InputGradients | + +Inputs/Outputs means the input/output of the operator, InputGradients/OutputGradients is the gradient respect to forward opeartor. Forward operator and Backward operator are isomorphic, save their corresponding needs into member attribute. + +We use a global hash map record the gradient operators available, follow the philosophy of minimum core, make operator pluggable unit. Each gradient is an operator and it needs to regist itself. + +grad_op_builder(fengjiayi) + +### Implement : Backward network + +given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`,`InputGradients`. + +1. bla bla bla (yuyang) + +2. NetOp + + when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively and ensure them done. During the process, we need to collect the `OutputGradients` name. + + We share variable in the same scope, as a result, duplicate operator `OutputGradients` will overwirte then duplicate variable. + + ![./images/duplicate_op]() + + Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator instead. + +![./images/duplicate_op2]() + +​ Then collect the sub graph OutputGradients/InputGradients as the NetOp's and return it. diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c6e12ca254553a8fc02cadbe3a99989ee848943 --- /dev/null +++ b/paddle/framework/backward_test.cc @@ -0,0 +1,363 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/backward.h" + +#include +#include "paddle/framework/op_registry.h" +#include "paddle/operators/net_op.h" +#include "paddle/operators/type_alias.h" + +namespace paddle { +namespace framework { + +class EmptyOp : public OperatorBase { + public: + void InferShape(const Scope &scope) const override {} + void Run(const Scope &scope, + const platform::DeviceContext &dev_ctx) const override {} +}; + +class RowWiseAddOpMaker : public OpProtoAndCheckerMaker { + public: + RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input X of Add").IgnoreGradient(); + AddInput("b", "Bias of Add").IgnoreGradient(); + AddOutput("Out", "Out of Add").IgnoreGradient(); + AddComment("Add Op"); + } +}; + +class MulOpMaker : public OpProtoAndCheckerMaker { + public: + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("A", "A"); + AddInput("B", "B"); + AddOutput("Out", "Out"); + AddComment("Mul"); + } +}; + +class SigmoidOpMaker : public OpProtoAndCheckerMaker { + public: + SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "X"); + AddOutput("Y", "Y"); + AddComment("Sigmoid"); + } +}; + +class NoGradOpMaker : public OpProtoAndCheckerMaker { + public: + NoGradOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "X input"); + AddOutput("Y", "Y output"); + AddComment("NoGradOp, same input output. no Grad"); + } +}; + +class FcOp : public ops::NetOp { + public: + void Init() override { + AddOp(OpRegistry::CreateOp("mul", {Input("X"), Input("W")}, + {Output("mul_result")}, {})); + auto b_name = Input("b"); + std::string before_act = "mul_result"; + if (b_name != kEmptyVarName) { + AddOp(OpRegistry::CreateOp("rowwise_add", {Output("mul_result"), b_name}, + {Output("add_result")}, {})); + before_act = "add_result"; + } else { + auto out_varname = Output("add_result"); + if (out_varname != kEmptyVarName) { + this->Rename(out_varname, kEmptyVarName); + } + } + + AddOp(OpRegistry::CreateOp("sigmoid", {Output(before_act)}, {Output("Out")}, + {})); + CompleteAddOp(false); + } +}; + +class FcOpMaker : public OpProtoAndCheckerMaker { + public: + FcOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "x"); + AddInput("W", "w"); + AddInput("b", "b"); + AddOutput("mul_result", "").SetTemporary(); + AddOutput("add_result", "").SetTemporary(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class ManyOutputOpMaker : public OpProtoAndCheckerMaker { + public: + ManyOutputOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("x", "x"); + AddOutput("y", "y"); + AddOutput("z", "z"); + AddComment(""); + } +}; + +class FillZeroOpMaker : public OpProtoAndCheckerMaker { + public: + FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("x", "x"); + AddOutput("out", "out"); + AddComment(""); + } +}; + +class AddOpMaker : public OpProtoAndCheckerMaker { + public: + AddOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "x").SetMultiple(); + AddOutput("Y", "y"); + AddComment(""); + } +}; +} // namespace framework +} // namespace paddle + +namespace f = paddle::framework; +using EnforceNotMet = paddle::platform::EnforceNotMet; +REGISTER_OP(rowwise_add, f::EmptyOp, f::RowWiseAddOpMaker); +REGISTER_GRADIENT_OP(rowwise_add, rowwise_add_grad, f::EmptyOp); +REGISTER_OP(mul, f::EmptyOp, f::MulOpMaker); +REGISTER_GRADIENT_OP(mul, mul_grad, f::EmptyOp); +REGISTER_OP(sigmoid, f::EmptyOp, f::SigmoidOpMaker); +REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, f::EmptyOp); +REGISTER_OP(nograd, f::EmptyOp, f::NoGradOpMaker); +REGISTER_OP(fill_zeros_like, f::EmptyOp, f::FillZeroOpMaker); +REGISTER_OP(add, f::EmptyOp, f::AddOpMaker); +REGISTER_GRADIENT_OP(add, add_grad, f::EmptyOp); +REGISTER_OP(fc, f::FcOp, f::FcOpMaker); +REGISTER_OP(many_output_op, f::EmptyOp, f::ManyOutputOpMaker); +REGISTER_GRADIENT_OP(many_output_op, many_output_op_grad, f::EmptyOp); + +TEST(Backward, simple_op_grad) { + auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {}); + ASSERT_NE(fwd, nullptr); + auto gop = f::OpRegistry::CreateGradOp(*fwd); + ASSERT_EQ(4UL, gop->inputs_.size()); + ASSERT_EQ(f::kEmptyVarName, gop->inputs_[0]); + ASSERT_EQ("rowwise_add_grad", gop->type_); + ASSERT_EQ("X" + f::kGradVarSuffix, gop->outputs_[0]); + ASSERT_EQ("b" + f::kGradVarSuffix, gop->outputs_[1]); + + ASSERT_EQ("X" + f::kGradVarSuffix, gop->Output("X" + f::kGradVarSuffix)); +} + +TEST(Backward, simple_op_not_need_grad) { + auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {}); + ASSERT_NE(fwd, nullptr); + auto gop = f::Backward(*fwd, {"X"}); + ASSERT_EQ(std::find(gop->outputs_.begin(), gop->outputs_.end(), + "X" + f::kGradVarSuffix), + gop->outputs_.end()); + + auto no_input_gop = f::Backward(*fwd, {"X", "b"}); + ASSERT_NE(no_input_gop, nullptr); + ASSERT_TRUE(no_input_gop->IsNetOp()); + ASSERT_EQ(0UL, + std::static_pointer_cast(no_input_gop)->ops_.size()); +} + +TEST(Backward, net_fc_backward_normal) { + std::shared_ptr fwd = f::OpRegistry::CreateOp( + "fc", {"X", "w", "b"}, {"mul_result", "add_result", "out"}, {}); + ASSERT_NE(fwd, nullptr); + std::shared_ptr gop = f::Backward(*fwd, {}); + ASSERT_TRUE(gop->IsNetOp()); + auto net = static_cast(gop.get()); + + ASSERT_NO_THROW(net->DebugString()); + + ASSERT_EQ(3UL, net->ops_.size()); + + f::OperatorBase &d_sigmoid = *net->ops_[0]; + ASSERT_EQ("sigmoid_grad", d_sigmoid.type_); + + f::OperatorBase &d_add = *net->ops_[1]; + ASSERT_EQ("rowwise_add_grad", d_add.type_); + + f::OperatorBase &d_mul = *net->ops_[2]; + ASSERT_EQ("mul_grad", d_mul.type_); +} + +TEST(Backward, net_fc_backward_not_have_b) { + std::shared_ptr fwd = + f::OpRegistry::CreateOp("fc", {"X", "w", f::kEmptyVarName}, + {"mul_result", "add_result", "tmp"}, {}); + ASSERT_NE(fwd, nullptr); + std::shared_ptr gop = f::Backward(*fwd, {}); + ASSERT_TRUE(gop->IsNetOp()); + auto net = static_cast(gop.get()); + + ASSERT_NO_THROW(net->DebugString()); + + ASSERT_EQ(2UL, net->ops_.size()); + + f::OperatorBase &d_sigmoid = *net->ops_[0]; + ASSERT_EQ("sigmoid_grad", d_sigmoid.type_); + + f::OperatorBase &d_mul = *net->ops_[1]; + ASSERT_EQ("mul_grad", d_mul.type_); +} + +TEST(Backward, net_input_of_network_not_need_grad) { + ops::NetOp net; + net.AddOp(f::OpRegistry::CreateOp("fc", {"X", "W1", "b1"}, + {"mul_tmp_0", "add_tmp_0", "hidden0"}, {})); + net.AddOp(f::OpRegistry::CreateOp("fc", {"hidden0", "W2", "b2"}, + {"mul_tmp_1", "add_tmp_1", "hidden1"}, {})); + net.CompleteAddOp(); + auto bwd = Backward(net, {"X"}); // X@GRAD is not need. + ASSERT_TRUE(bwd->IsNetOp()); + auto bwd_net = static_cast(bwd.get()); + + std::unordered_set all_output = std::unordered_set( + bwd_net->outputs_.begin(), bwd_net->outputs_.end()); + all_output.erase(f::kEmptyVarName); + + for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) { + ASSERT_NE(all_output.find(out + f::kGradVarSuffix), all_output.end()); + } + + // Not Generated X + ASSERT_EQ(all_output.find("X" + f::kGradVarSuffix), all_output.end()); + + ASSERT_EQ(2UL, bwd_net->ops_.size()); + ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp()); + auto first_fc_grad = static_cast(bwd_net->ops_[1].get()); + ASSERT_EQ(3UL, first_fc_grad->ops_.size()); + ASSERT_EQ(f::kEmptyVarName, + first_fc_grad->ops_[2]->Output("A" + f::kGradVarSuffix)); +} + +TEST(Backward, net_shared_weight) { + ops::NetOp net; + net.AddOp(f::OpRegistry::CreateOp("mul", {"X", "W"}, {"Out"}, {})); + net.AddOp(f::OpRegistry::CreateOp("mul", {"Out", "W"}, {"FinalOut"}, {})); + net.CompleteAddOp(); + + auto bwd = f::Backward(net, {}); + ASSERT_TRUE(bwd->IsNetOp()); + auto bwd_net = static_cast(bwd.get()); + ASSERT_EQ(3UL, bwd_net->ops_.size()); + ASSERT_EQ("add", bwd_net->ops_[2]->type_); +} + +TEST(Backward, op_register_grad_not_for_network) { + auto fwd = f::OpRegistry::CreateOp( + "fc", {"X", "W", "b"}, {"mul_out", "add_out", "out1"}, + {{"temporary_index", std::vector{0, 1}}}); + + ASSERT_THROW(f::OpRegistry::CreateGradOp(*fwd), EnforceNotMet); +} + +TEST(Backward, op_all_input_are_not_need) { + auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {}); + auto backward = f::Backward(*fwd, {"X", "b"}); + ASSERT_TRUE(backward->IsNetOp()); + auto net = static_cast(backward.get()); + ASSERT_TRUE(net->ops_.empty()); +} + +TEST(Backward, op_all_output_are_not_need) { + auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {}); + auto backward = f::Backward(*fwd, {"Out"}); + ASSERT_TRUE(backward->IsNetOp()); + auto net = static_cast(backward.get()); + ASSERT_TRUE(net->ops_.empty()); +} + +TEST(Backward, op_part_of_output_are_not_need) { + auto fwd = f::OpRegistry::CreateOp("many_output_op", {"X"}, {"Y", "Z"}, {}); + auto backward = f::Backward(*fwd, {"Z"}); + ASSERT_TRUE(backward->IsNetOp()); + auto net = static_cast(backward.get()); + ASSERT_EQ(net->ops_.size(), 2UL); + + auto &fill_zero = *net->ops_[0]; + ASSERT_EQ("fill_zeros_like", fill_zero.type_); + ASSERT_EQ(1UL, fill_zero.inputs_.size()); + ASSERT_EQ("Z", fill_zero.inputs_[0]); + ASSERT_EQ(1UL, fill_zero.outputs_.size()); + ASSERT_EQ("Z" + f::kZeroVarSuffix, fill_zero.outputs_[0]); + + auto &d_many_out = *net->ops_[1]; + ASSERT_EQ("many_output_op_grad", d_many_out.type_); + ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.inputs_.size()); // I/O/OG + ASSERT_EQ("Z" + f::kZeroVarSuffix, d_many_out.Input("z" + f::kGradVarSuffix)); + ASSERT_EQ("Y" + f::kGradVarSuffix, d_many_out.Input("y" + f::kGradVarSuffix)); + ASSERT_EQ("X" + f::kGradVarSuffix, + d_many_out.Output("x" + f::kGradVarSuffix)); +} + +TEST(Backward, op_part_of_input_are_not_need) { + auto fwd = f::OpRegistry::CreateOp("mul", {"a", "b"}, {"out"}, {}); + auto backward = f::Backward(*fwd, {"a"}); + auto &grad_mul = *backward; + ASSERT_EQ(grad_mul.type_, "mul_grad"); + ASSERT_EQ(grad_mul.inputs_.size(), 2UL + 1UL + 1UL); + ASSERT_EQ(grad_mul.outputs_.size(), 2UL); + ASSERT_EQ(grad_mul.Output("A" + f::kGradVarSuffix), f::kEmptyVarName); + ASSERT_EQ(grad_mul.Output("B" + f::kGradVarSuffix), "b" + f::kGradVarSuffix); + ASSERT_EQ(grad_mul.Input("Out" + f::kGradVarSuffix), + "out" + f::kGradVarSuffix); + ASSERT_EQ(grad_mul.Input("A"), "a"); + ASSERT_EQ(grad_mul.Input("B"), "b"); + ASSERT_EQ(grad_mul.Input("Out"), "out"); +} + +TEST(Backward, linear_net_intermediate_variable_has_no_grad) { + ops::NetOp net; + net.AddOp(f::OpRegistry::CreateOp("fc", {"x1", "w1", "b1"}, + {"mul_out1", "add_out1", "out1"}, {})); + net.AddOp(f::OpRegistry::CreateOp("fc", {"out1", "w2", "b2"}, + {"mul_out2", "tmp_out2", "out2"}, {})); + net.AddOp(f::OpRegistry::CreateOp("fc", {"out2", "w3", "b3"}, + {"mul_out3", "tmp_out3", "out3"}, {})); + net.CompleteAddOp(); + auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"}); + ASSERT_TRUE(backward->IsNetOp()); + auto bwd_net = static_cast(backward.get()); + ASSERT_EQ(bwd_net->ops_.size(), 3UL); + auto &grad_fc = *bwd_net->ops_[0]; + EXPECT_EQ(grad_fc.inputs_.size(), + 3UL /* external input number */ + + 1UL /* external output number*/ + + 1UL /* number of gradient of external output*/ + + 2U /* internal variable number*/); + EXPECT_EQ(grad_fc.outputs_.size(), 2UL /* input number of mul*/ + + 2UL /* input number of rowwise_add */ + + 1UL /* input number of sigmod */); + EXPECT_EQ(bwd_net->ops_[1]->inputs_.size(), 0UL); + EXPECT_EQ(bwd_net->ops_[1]->outputs_.size(), 0UL); + EXPECT_EQ(bwd_net->ops_[2]->inputs_.size(), 0UL); + EXPECT_EQ(bwd_net->ops_[2]->outputs_.size(), 0UL); +} diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index 9fcc657edcd5459d0a42a64d708603a4bcd53cf0..5aa5af0c19be5a209c760282cb1a090fc57a53ad 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -25,18 +25,15 @@ limitations under the License. */ namespace paddle { namespace framework { -namespace { -typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>, - Dim<8>, Dim<9>> - DDimVar; -} - /** * \brief A dynamically sized dimension. * * The number of dimensions must be between [1, 9]. */ struct DDim { + typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>, + Dim<8>, Dim<9>> + DDimVar; DDimVar var; DDim() : var(Dim<1>()) {} diff --git a/paddle/framework/eigen.h b/paddle/framework/eigen.h index 5f3358c69b3fbbbfcd97a96ab50fde3d8b9efad0..a4667cc51fadfc020d3211b7a82356db386fced1 100644 --- a/paddle/framework/eigen.h +++ b/paddle/framework/eigen.h @@ -80,5 +80,21 @@ struct EigenVector : public EigenTensor { } }; +template +struct EigenScalar { + // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. + using Type = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + using ConstType = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + + static Type From(Tensor& tensor) { return Type(tensor.data()); } + + static ConstType From(const Tensor& tensor) { + return ConstType(tensor.data()); + } +}; + } // namespace framework } // namespace paddle diff --git a/paddle/framework/eigen_test.cc b/paddle/framework/eigen_test.cc index a9fa728e49a0dcc781e520a22c1ee5f921c4c733..dc1957691b1a202826e10e84c21ac8874df9e378 100644 --- a/paddle/framework/eigen_test.cc +++ b/paddle/framework/eigen_test.cc @@ -46,6 +46,17 @@ TEST(Eigen, Tensor) { } } +TEST(Eigen, ScalarFrom) { + Tensor t; + int* p = t.mutable_data(make_ddim({1}), platform::CPUPlace()); + *p = static_cast(100); + + EigenScalar::Type es = EigenScalar::From(t); + + ASSERT_EQ(0, es.dimension(0)); + ASSERT_EQ(100, es(0)); +} + TEST(Eigen, VectorFrom) { Tensor t; float* p = t.mutable_data(make_ddim({6}), platform::CPUPlace()); diff --git a/paddle/framework/grad_op_builder.cc b/paddle/framework/grad_op_builder.cc index 6235be75f27dadb65de663ff1b3caf26a649f6cb..6d032fb78f099f5142d64e531d1a03c10ed5e68e 100644 --- a/paddle/framework/grad_op_builder.cc +++ b/paddle/framework/grad_op_builder.cc @@ -8,108 +8,95 @@ You may obtain a copy of the License at Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +WITHOpArgType::OUT WARRANTIES OR CONDITIONS OF ANY KOpArgType::IND, either +express or implied. See the License for the specific language governing +permissions and limitations under the License. */ #include "paddle/framework/grad_op_builder.h" +#include "paddle/framework/op_proto.pb.h" #include "paddle/framework/op_registry.h" namespace paddle { namespace framework { -OperatorBase* GradOpBuilder::Build() { - BuildOpInOutArgList(); - std::string grad_op_type = OpRegistry::grad_ops().at(op_->type_); - OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)(); - grad_op->type_ = grad_op_type; - CompleteGradOp(grad_op); - return grad_op; -} +class OpRegistry; + +using VarIndexMap = std::unordered_map; -OpInOutArg* GradOpBuilder::BuildArg(const VarProto& var, - const VarIndexMap& var_map, - const std::vector& format, - InOutType type) { - int idx = var_map.at(var.name()); - int begin_idx = format.empty() ? idx : format.at(idx); - int end_idx = format.empty() ? idx + 1 : format.at(idx + 1); - return new OpInOutArg(var.name(), type, !var.ignore_gradient(), begin_idx, - end_idx); +enum class OpArgType { IN, OUT }; + +static std::vector* GetOpFormat(OperatorBase* op, const OpArgType& type) { + std::string key = type == OpArgType::IN ? "input_format" : "output_format"; + return op->attrs_.count(key) + ? &boost::get>(op->attrs_.at(key)) + : nullptr; } -void GradOpBuilder::BuildOpInOutArgList() { - const OpProto& op_proto = OpRegistry::protos().at(op_->type_); - const auto& var_map = *(OpRegistry::VarIndexMaps().at(op_->type_)); - const std::vector& in_format = - op_->attrs_.count("input_format") - ? op_->GetAttr>("input_format") - : std::vector(); - const std::vector& out_format = - op_->attrs_.count("output_format") - ? op_->GetAttr>("output_format") - : std::vector(); - for (const auto& var : op_proto.inputs()) { - arg_list_.emplace_back( - std::shared_ptr(BuildArg(var, var_map, in_format, IN))); - } - for (const auto& var : op_proto.outputs()) { - arg_list_.emplace_back( - std::shared_ptr(BuildArg(var, var_map, out_format, OUT))); - } +static const std::vector* GetOpFormat(const OperatorBase* op, + const OpArgType& type) { + std::string key = type == OpArgType::IN ? "input_format" : "output_format"; + return op->attrs_.count(key) + ? &boost::get>(op->attrs_.at(key)) + : nullptr; } -void GradOpBuilder::AddArgIntoGradOp(const OpInOutArg* arg, - std::vector& in_out, - std::vector& format, - VarIndexMap* varmap, int& idx, - bool is_grad) const { - std::string var_name = arg->proto_name_; - if (is_grad) { - var_name += OperatorBase::GRAD_VAR_SUFFIX(); - } - (*varmap)[var_name] = idx++; - size_t pre_sz = in_out.size(); - auto base_it = - arg->type_ == IN ? op_->inputs_.begin() : op_->outputs_.begin(); - std::copy(base_it + arg->begin_idx_, base_it + arg->end_idx_, - std::back_inserter(in_out)); - if (is_grad) { - for (size_t i = pre_sz; i < in_out.size(); ++i) { - in_out[i] += OperatorBase::GRAD_VAR_SUFFIX(); +static void TransOpArg(const OperatorBase* src_op, OperatorBase* dst_op, + const OpArgType& src_type, const OpArgType& dst_type, + int& idx, bool is_grad) { + const std::vector& src_inout = + src_type == OpArgType::IN ? src_op->inputs_ : src_op->outputs_; + const std::vector* src_format = GetOpFormat(src_op, src_type); + + std::vector& dst_inout = + dst_type == OpArgType::IN ? dst_op->inputs_ : dst_op->outputs_; + std::vector* dst_format = GetOpFormat(dst_op, dst_type); + const OpProto& proto = OpRegistry::protos().at(src_op->type_); + const auto& src_arg_list = + src_type == OpArgType::IN ? proto.inputs() : proto.outputs(); + + for (const auto& arg : src_arg_list) { + std::string src_name = arg.name(); + std::string dst_name = is_grad ? src_name + kGradVarSuffix : src_name; + (*dst_op->in_out_idxs_)[dst_name] = idx++; + int src_arg_idx = src_op->in_out_idxs_->at(src_name); + int src_begin = + src_format == nullptr ? src_arg_idx : src_format->at(src_arg_idx); + int src_end = src_format == nullptr ? src_arg_idx + 1 + : src_format->at(src_arg_idx + 1); + for (int i = src_begin; i < src_end; ++i) { + std::string s = + is_grad ? src_inout[i] + kGradVarSuffix + : (arg.ignore_gradient() ? kEmptyVarName : src_inout[i]); + dst_inout.emplace_back(s); + } + if (dst_format != nullptr) { + dst_format->push_back(dst_inout.size()); } } - format.push_back(in_out.size()); } -void GradOpBuilder::CompleteGradOp(OperatorBase* grad_op) const { - grad_op->attrs_ = op_->attrs_; +OperatorBase* BuildGradOp(const OperatorBase* op) { + std::string grad_op_type = OpRegistry::grad_ops().at(op->type_); + OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)(); + grad_op->type_ = grad_op_type; + grad_op->attrs_ = op->attrs_; grad_op->attrs_.erase("input_format"); grad_op->attrs_.erase("output_format"); - VarIndexMap* grad_varmap = new VarIndexMap(); + if (GetOpFormat(op, OpArgType::IN) != nullptr) { + grad_op->attrs_["output_format"] = std::vector({0}); + } + if (GetOpFormat(op, OpArgType::IN) != nullptr || + GetOpFormat(op, OpArgType::OUT) != nullptr) { + grad_op->attrs_["input_format"] = std::vector({0}); + } + grad_op->in_out_idxs_.reset(new VarIndexMap()); int in_idx = 0; int out_idx = 0; - std::vector in_format({0}); - std::vector out_format({0}); - for (const auto& arg : arg_list_) { - // op_'s inputs_ and outputs_ - if (arg->needed_in_grad_) { - AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap, - in_idx, false); - } - if (arg->type_ == IN) { - // gradients of op_'s inputs_ - AddArgIntoGradOp(arg.get(), grad_op->outputs_, out_format, grad_varmap, - out_idx, true); - } else { - // gradients of op_'s outputs_ - AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap, - in_idx, true); - } - } - grad_op->attrs_["input_format"] = in_format; - grad_op->attrs_["output_format"] = out_format; - grad_op->in_out_idxs_.reset(grad_varmap); + TransOpArg(op, grad_op, OpArgType::IN, OpArgType::IN, in_idx, false); // I + TransOpArg(op, grad_op, OpArgType::OUT, OpArgType::IN, in_idx, false); // G + TransOpArg(op, grad_op, OpArgType::OUT, OpArgType::IN, in_idx, true); // OG + TransOpArg(op, grad_op, OpArgType::IN, OpArgType::OUT, out_idx, true); // IG + return grad_op; } } // namespace framework diff --git a/paddle/framework/grad_op_builder.h b/paddle/framework/grad_op_builder.h index 2ecf39479b4f4a51f89cd500caf851897df0e599..998f8ebbb5f2f4fb8b7e938b5916afd0f8a7930d 100644 --- a/paddle/framework/grad_op_builder.h +++ b/paddle/framework/grad_op_builder.h @@ -1,48 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #pragma once -#include "paddle/framework/op_proto.pb.h" #include "paddle/framework/operator.h" namespace paddle { namespace framework { -class OpRegistry; - -enum InOutType { IN, OUT }; - -struct OpInOutArg { - OpInOutArg(const std::string& proto_name, const InOutType& type, - bool needed_in_grad, size_t begin_idx, size_t end_idx) - : proto_name_(proto_name), - type_(type), - needed_in_grad_(needed_in_grad), - begin_idx_(begin_idx), - end_idx_(end_idx) {} - - std::string proto_name_; - InOutType type_; - bool needed_in_grad_; - size_t begin_idx_; - size_t end_idx_; -}; - -class GradOpBuilder { - using VarIndexMap = std::unordered_map; - - public: - GradOpBuilder(const OperatorBase* op) : op_(op) {} - OperatorBase* Build(); - - private: - OpInOutArg* BuildArg(const VarProto& var, const VarIndexMap& var_map, - const std::vector& format, InOutType type); - void BuildOpInOutArgList(); - void AddArgIntoGradOp(const OpInOutArg* arg, std::vector& in_out, - std::vector& format, VarIndexMap* varmap, int& idx, - bool is_grad) const; - void CompleteGradOp(OperatorBase* grad_op) const; - const OperatorBase* op_; - std::vector> arg_list_; -}; + +OperatorBase* BuildGradOp(const OperatorBase* op); } // namespace framework } // namespace paddle diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc index 288a7841cd7c9212d8fa230e38d49dfc26e76256..cf7143eba4460e5619188b82ffe23db11a04a236 100644 --- a/paddle/framework/grad_op_builder_test.cc +++ b/paddle/framework/grad_op_builder_test.cc @@ -8,10 +8,49 @@ USE_OP(add_two); namespace paddle { namespace framework { +class NOP : public OperatorBase { + public: + void InferShape(const Scope &scope) const override {} + void Run(const Scope &scope, + const platform::DeviceContext &dev_ctx) const override {} +}; + +class MutiInOutOpMaker : public OpProtoAndCheckerMaker { + public: + MutiInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("In1", "a single input"); + AddInput("In2_mult", "a multiple input").SetMultiple(); + AddInput("In3", "another single input"); + AddOutput("Out1", "a single output"); + AddOutput("Out2_mult", "a multiple output").SetMultiple(); + AddComment("test op with multiple inputs and outputs"); + } +}; + +class IOIgnoredOpMaker : public OpProtoAndCheckerMaker { + public: + IOIgnoredOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("In1", "a single input"); + AddInput("In2_mult", "a multiple input").SetMultiple().IgnoreGradient(); + AddInput("In3_mult", "another multiple input").SetMultiple(); + AddOutput("Out1_mult", "a multiple output").SetMultiple(); + AddOutput("Out2", "a single output").IgnoreGradient(); + AddComment("op with inputs and outputs ignored in gradient calculating"); + } +}; + +} // namespace framework +} // namespace paddle + +namespace f = paddle::framework; + TEST(GradOpBuilder, AddTwo) { - std::shared_ptr add_op( - OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {})); - std::shared_ptr grad_add_op = OpRegistry::CreateGradOp(add_op); + std::shared_ptr add_op( + f::OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {})); + std::shared_ptr grad_add_op = + f::OpRegistry::CreateGradOp(*add_op); EXPECT_EQ(static_cast(grad_add_op->inputs_.size()), 4); EXPECT_EQ(static_cast(grad_add_op->outputs_.size()), 2); EXPECT_EQ(grad_add_op->Input("X"), "x"); @@ -22,5 +61,77 @@ TEST(GradOpBuilder, AddTwo) { EXPECT_EQ(grad_add_op->Output("Y@GRAD"), "y@GRAD"); } -} // namespace framework -} // namespace paddle \ No newline at end of file +REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker); +REGISTER_GRADIENT_OP(mult_io, mult_io_grad, f::NOP); +REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker); +REGISTER_GRADIENT_OP(io_ignored, io_ignored_grad, f::NOP); + +TEST(GradOpBuilder, MutiInOut) { + f::AttributeMap attrs{{"input_format", std::vector{0, 1, 4, 5}}, + {"output_format", std::vector{0, 1, 3}}}; + std::shared_ptr test_op(f::OpRegistry::CreateOp( + "mult_io", {"in1", "in2_1", "in2_2", "in2_3", "in3"}, + {"out1", "out2_1", "out2_2"}, attrs)); + std::shared_ptr grad_test_op = + f::OpRegistry::CreateGradOp(*test_op); + + ASSERT_EQ(grad_test_op->inputs_.size(), 5UL + 3UL + 3UL); + EXPECT_EQ(grad_test_op->Input("In1"), "in1"); + EXPECT_EQ(grad_test_op->Inputs("In2_mult"), + std::vector({"in2_1", "in2_2", "in2_3"})); + EXPECT_EQ(grad_test_op->Input("In3"), "in3"); + EXPECT_EQ(grad_test_op->Input("Out1"), "out1"); + EXPECT_EQ(grad_test_op->Inputs("Out2_mult"), + std::vector({"out2_1", "out2_2"})); + EXPECT_EQ(grad_test_op->Input("Out1" + f::kGradVarSuffix), + "out1" + f::kGradVarSuffix); + EXPECT_EQ(grad_test_op->Inputs("Out2_mult" + f::kGradVarSuffix), + std::vector( + {"out2_1" + f::kGradVarSuffix, "out2_2" + f::kGradVarSuffix})); + + ASSERT_EQ(grad_test_op->outputs_.size(), 5UL); + EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix), + "in1" + f::kGradVarSuffix); + EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix), + std::vector({"in2_1" + f::kGradVarSuffix, + "in2_2" + f::kGradVarSuffix, + "in2_3" + f::kGradVarSuffix})); + EXPECT_EQ(grad_test_op->Output("In3" + f::kGradVarSuffix), + "in3" + f::kGradVarSuffix); +} + +TEST(GradOpBuilder, IOIgnoredInGradient) { + f::AttributeMap attrs{{"input_format", std::vector{0, 1, 3, 5}}, + {"output_format", std::vector{0, 2, 3}}}; + std::shared_ptr test_op(f::OpRegistry::CreateOp( + "io_ignored", {"in1", "in2_1", "in2_2", "in3_1", "in3_2"}, + {"out1_1", "out1_2", "out2"}, attrs)); + std::shared_ptr grad_test_op = + f::OpRegistry::CreateGradOp(*test_op); + + // 'In2' and 'Out2' are ignored in gradient calculating + ASSERT_EQ(grad_test_op->inputs_.size(), 5UL + 3UL + 3UL); + EXPECT_EQ(grad_test_op->Input("In1"), "in1"); + EXPECT_EQ(grad_test_op->Inputs("In2_mult"), + std::vector({f::kEmptyVarName, f::kEmptyVarName})); + EXPECT_EQ(grad_test_op->Inputs("In3_mult"), + std::vector({"in3_1", "in3_2"})); + EXPECT_EQ(grad_test_op->Inputs("Out1_mult"), + std::vector({"out1_1", "out1_2"})); + EXPECT_EQ(grad_test_op->Input("Out2"), f::kEmptyVarName); + EXPECT_EQ(grad_test_op->Inputs("Out1_mult" + f::kGradVarSuffix), + std::vector( + {"out1_1" + f::kGradVarSuffix, "out1_2" + f::kGradVarSuffix})); + EXPECT_EQ(grad_test_op->Input("Out2" + f::kGradVarSuffix), + "out2" + f::kGradVarSuffix); + + ASSERT_EQ(grad_test_op->outputs_.size(), 5UL); + EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix), + "in1" + f::kGradVarSuffix); + EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix), + std::vector( + {"in2_1" + f::kGradVarSuffix, "in2_2" + f::kGradVarSuffix})); + EXPECT_EQ(grad_test_op->Outputs("In3_mult" + f::kGradVarSuffix), + std::vector( + {"in3_1" + f::kGradVarSuffix, "in3_2" + f::kGradVarSuffix})); +} diff --git a/paddle/framework/images/duplicate_op.graffle b/paddle/framework/images/duplicate_op.graffle new file mode 100644 index 0000000000000000000000000000000000000000..5979f792e252f028a615729215529c2be42d9165 Binary files /dev/null and b/paddle/framework/images/duplicate_op.graffle differ diff --git a/paddle/framework/images/duplicate_op.png b/paddle/framework/images/duplicate_op.png new file mode 100644 index 0000000000000000000000000000000000000000..f299c5d37f260a1bb0daec886f0a4ee1c1f31c92 Binary files /dev/null and b/paddle/framework/images/duplicate_op.png differ diff --git a/paddle/framework/images/duplicate_op2.graffle b/paddle/framework/images/duplicate_op2.graffle new file mode 100644 index 0000000000000000000000000000000000000000..2b658085d6a55d368c320051ba7f94ec2900f13c Binary files /dev/null and b/paddle/framework/images/duplicate_op2.graffle differ diff --git a/paddle/framework/images/duplicate_op2.png b/paddle/framework/images/duplicate_op2.png new file mode 100644 index 0000000000000000000000000000000000000000..c5588015d1450fd8c1bda3580680d884494868bb Binary files /dev/null and b/paddle/framework/images/duplicate_op2.png differ diff --git a/paddle/framework/op_desc.proto b/paddle/framework/op_desc.proto index 89497f3c16bc28aa93b25a83c1f2eccafdf1c5b4..d95ba26f88ae181f991440e0df30c80f80a7eb2a 100644 --- a/paddle/framework/op_desc.proto +++ b/paddle/framework/op_desc.proto @@ -12,24 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -syntax="proto2"; +syntax = "proto2"; package paddle.framework; -import "attr_type.proto"; +import "attribute.proto"; // AttrDesc is used to describe Attributes of an Operator. It contain's // name, type, and value of Attribute. // // e.g, for scale=3.0: name=scala, type=AttrType.FLOAT, value=3.0 message AttrDesc { - required string name = 1; - required AttrType type = 2; - optional int32 i = 3; - optional float f = 4; - optional string s = 5; - repeated int32 ints = 6; - repeated float floats = 7; - repeated string strings = 8; + required string name = 1; + required AttrType type = 2; + optional int32 i = 3; + optional float f = 4; + optional string s = 5; + repeated int32 ints = 6; + repeated float floats = 7; + repeated string strings = 8; }; // Protocol Message to describe an Operator. @@ -42,15 +42,15 @@ message AttrDesc { // 3rd-party language can build this proto message and call // AddOp(const OpDesc& op_desc) of Paddle core to create an Operator. message OpDesc { - // input names of this Operator. - repeated string inputs = 1; + // input names of this Operator. + repeated string inputs = 1; - // output names of this Operator. - repeated string outputs = 2; + // output names of this Operator. + repeated string outputs = 2; - // type of this Operator, such as "add", "sub", "fc". - required string type = 3; + // type of this Operator, such as "add", "sub", "fc". + required string type = 3; - // Attributes of this Operator. e.g., scale=3.0 in cosine op. - repeated AttrDesc attrs = 4; + // Attributes of this Operator. e.g., scale=3.0 in cosine op. + repeated AttrDesc attrs = 4; }; \ No newline at end of file diff --git a/paddle/framework/op_proto.proto b/paddle/framework/op_proto.proto index 366c84e53dc29e41eefbaef0a6452e01c4fe37bd..52292162874b9ca207fb0d3917df41ade096b143 100644 --- a/paddle/framework/op_proto.proto +++ b/paddle/framework/op_proto.proto @@ -15,100 +15,102 @@ limitations under the License. */ // Protocol Message for 3rd-party language binding. // // Paddle Python package will use `OpProto` to generate op creation methods. -// The op creation methods take user's input and generate `OpDesc` proto message, +// The op creation methods take user's input and generate `OpDesc` proto +// message, // then pass `OpDesc` to C++ side and create Op pointer. // -syntax="proto2"; +syntax = "proto2"; package paddle.framework; -import "attr_type.proto"; +import "attribute.proto"; // Attribute protocol message for 3rd-party language binding. // It will store the Op support what attribute and what type. message AttrProto { - // Supported attribute name. e.g. `scale` for cosine op. - required string name = 1; + // Supported attribute name. e.g. `scale` for cosine op. + required string name = 1; - // Supported attribute type. - required AttrType type = 2; + // Supported attribute type. + required AttrType type = 2; - // Supported attribute comments. It helps 3rd-party language generate doc-string. - required string comment = 3; + // Supported attribute comments. It helps 3rd-party language generate + // doc-string. + required string comment = 3; - // If that attribute is generated, it means the Paddle third language - // binding has responsibility to fill that attribute. End-User should - // not set that attribute. - optional bool generated = 4 [default=false]; + // If that attribute is generated, it means the Paddle third language + // binding has responsibility to fill that attribute. End-User should + // not set that attribute. + optional bool generated = 4 [ default = false ]; } // Input or output message for 3rd-party language binding. // It contains parameter name and its comments. message VarProto { - // Input or output name in that op creation function. - // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names. - required string name = 1; - - // The comment for that input. It helps 3rd-party language generate doc-string. - required string comment = 2; - - // Is that input/output could be a list or not. - // If so, that Op should write a attributed named `input_format` or - // `output_format`. - // - // e.g. - // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W` - // could be multiple, so the multiple of `X` and `W` is True, and OpDesc - // will hold a attribute of them. - // - // The Op desc of same fc could be - // { - // "type": "fc", - // "input": ["X1", "X2", "W1", "W2", "b"], - // "output": "fc.out", - // "attrs" : { - // "input_format": [0, 2, 4, 5] - // } - // } - // - optional bool multiple = 3 [default=false]; - - // It marks that output is a temporary output. That output is not used by - // user, but used by other op internally as input. If other op is not use - // that output, it could be optimized early. - // - // Attribute temporary_index will be set in OpDesc if there is some - // outputs are temporary. - // - // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"], - // attrs = { - // "temporary_index": [1] - // } - optional bool temporary = 4 [default=false]; - - // The gradient of operator can be ignored immediately - // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2 - // can be ignored for the future optimized on graph. - optional bool ignore_gradient = 6; + // Input or output name in that op creation function. + // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names. + required string name = 1; + + // The comment for that input. It helps 3rd-party language generate + // doc-string. + required string comment = 2; + + // Is that input/output could be a list or not. + // If so, that Op should write a attributed named `input_format` or + // `output_format`. + // + // e.g. + // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W` + // could be multiple, so the multiple of `X` and `W` is True, and OpDesc + // will hold a attribute of them. + // + // The Op desc of same fc could be + // { + // "type": "fc", + // "input": ["X1", "X2", "W1", "W2", "b"], + // "output": "fc.out", + // "attrs" : { + // "input_format": [0, 2, 4, 5] + // } + // } + // + optional bool multiple = 3 [ default = false ]; + + // It marks that output is a temporary output. That output is not used by + // user, but used by other op internally as input. If other op is not use + // that output, it could be optimized early. + // + // Attribute temporary_index will be set in OpDesc if there is some + // outputs are temporary. + // + // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"], + // attrs = { + // "temporary_index": [1] + // } + optional bool temporary = 4 [ default = false ]; + + // The gradient of operator can be ignored immediately + // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2 + // can be ignored for the future optimized on graph. + optional bool ignore_gradient = 6; } // Op protocol message for 3rd-party language binding. // It contains all information for generating op creation method. message OpProto { - // The input information to generate op creation method. - repeated VarProto inputs = 1; + // The input information to generate op creation method. + repeated VarProto inputs = 1; - // The output information to generate op creation method. - repeated VarProto outputs = 2; + // The output information to generate op creation method. + repeated VarProto outputs = 2; - // The attribute information to generate op creation method. - repeated AttrProto attrs = 3; + // The attribute information to generate op creation method. + repeated AttrProto attrs = 3; - // The comments for that Op. It helps 3rd-party language generate - // doc-string. The whole documentation of that Op is generated by comment, - // inputs, outputs, attrs together. - required string comment = 4; - - // The type of that Op. - required string type = 5; + // The comments for that Op. It helps 3rd-party language generate + // doc-string. The whole documentation of that Op is generated by comment, + // inputs, outputs, attrs together. + required string comment = 4; + // The type of that Op. + required string type = 5; } diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc index 1d14535c50b542733663a6900a8b5f2033290ea6..1caa02a2a1d046778f875d04eeaef957be741302 100644 --- a/paddle/framework/op_registry.cc +++ b/paddle/framework/op_registry.cc @@ -14,37 +14,8 @@ limitations under the License. */ #include -namespace paddle { -namespace framework { - -template <> -void AttrTypeHelper::SetAttrType(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::INT); -} - -template <> -void AttrTypeHelper::SetAttrType(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::FLOAT); -} - -template <> -void AttrTypeHelper::SetAttrType(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::STRING); -} +#include -template <> -void AttrTypeHelper::SetAttrType>(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::INTS); -} - -template <> -void AttrTypeHelper::SetAttrType>(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::FLOATS); -} - -template <> -void AttrTypeHelper::SetAttrType>(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::STRINGS); -} -} // namespace framework +namespace paddle { +namespace framework {} // namespace framework } // namespace paddle diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 384f0f631dd9b9a4dd7c0c628340afe668bc248f..6c26183818a9d6996e3d3ce2af74ba36f4711eca 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -19,7 +19,7 @@ limitations under the License. */ #include #include #include -#include "paddle/framework/attr_checker.h" +#include "paddle/framework/attribute.h" #include "paddle/framework/grad_op_builder.h" #include "paddle/framework/op_desc.pb.h" #include "paddle/framework/scope.h" @@ -27,49 +27,6 @@ limitations under the License. */ namespace paddle { namespace framework { -// helper class to set attribute type -struct AttrTypeHelper { - template - static void SetAttrType(AttrProto* attr); - - static Attribute GetAttrValue(const AttrDesc& attr_desc) { - switch (attr_desc.type()) { - case paddle::framework::AttrType::INT: { - return attr_desc.i(); - } - case paddle::framework::AttrType::FLOAT: { - return attr_desc.f(); - } - case paddle::framework::AttrType::STRING: { - return attr_desc.s(); - } - case paddle::framework::AttrType::INTS: { - std::vector val(attr_desc.ints_size()); - for (int i = 0; i < attr_desc.ints_size(); ++i) { - val[i] = attr_desc.ints(i); - } - return val; - } - case paddle::framework::AttrType::FLOATS: { - std::vector val(attr_desc.floats_size()); - for (int i = 0; i < attr_desc.floats_size(); ++i) { - val[i] = attr_desc.floats(i); - } - return val; - } - case paddle::framework::AttrType::STRINGS: { - std::vector val(attr_desc.strings_size()); - for (int i = 0; i < attr_desc.strings_size(); ++i) { - val[i] = attr_desc.strings(i); - } - return val; - } - } - PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !"); - return boost::blank(); - } -}; - // this class not only make proto but also init attribute checkers. class OpProtoAndCheckerMaker { public: @@ -86,43 +43,46 @@ class OpProtoAndCheckerMaker { } protected: - void AddInput(const std::string& name, const std::string& comment, - bool multiple = false, bool ignore_gradient = false) { + struct VariableBuilder { + VarProto* var_; + std::function on_multiple_; + std::function on_temporary_; + + VariableBuilder& SetMultiple() { + var_->set_multiple(true); + on_multiple_(); + return *this; + } + + VariableBuilder& SetTemporary() { + PADDLE_ENFORCE(bool(on_temporary_), "Cannot set temporary"); + var_->set_temporary(true); + on_temporary_(); + return *this; + } + + VariableBuilder& IgnoreGradient() { + var_->set_ignore_gradient(true); + return *this; + } + }; + + VariableBuilder AddInput(const std::string& name, + const std::string& comment) { auto input = proto_->mutable_inputs()->Add(); *input->mutable_name() = name; *input->mutable_comment() = comment; - input->set_ignore_gradient(ignore_gradient); - input->set_multiple(multiple); - if (multiple) { - SetHasMultipleInput(); - } - } - - void AddInputs(const std::string& name, const std::string& comment, - bool ignore_gradient = false) { - AddInput(name, comment, true, ignore_gradient); + return VariableBuilder{input, [=] { this->SetHasMultipleInput(); }, + nullptr}; } - void AddOutput(const std::string& name, const std::string& comment, - bool temporary = false, bool multiple = false, - bool ignore_gradient = false) { + VariableBuilder AddOutput(const std::string& name, + const std::string& comment) { auto output = proto_->mutable_outputs()->Add(); *output->mutable_name() = name; *output->mutable_comment() = comment; - output->set_ignore_gradient(ignore_gradient); - output->set_multiple(multiple); - if (multiple) { - SetHasMultipleOutput(); - } - output->set_temporary(temporary); - if (temporary) { - SetHasTemporaryOutput(); - } - } - - void AddOutputs(const std::string& name, const std::string& comment, - bool temporary = false, bool ignore_gradient = false) { - AddOutput(name, comment, temporary, true, ignore_gradient); + return VariableBuilder{output, [=] { this->SetHasMultipleOutput(); }, + [=] { this->SetHasTemporaryOutput(); }}; } template @@ -133,7 +93,7 @@ class OpProtoAndCheckerMaker { *attr->mutable_name() = name; *attr->mutable_comment() = comment; attr->set_generated(generated); - AttrTypeHelper::SetAttrType(attr); + attr->set_type(AttrTypeID()); return op_checker_->AddAttrChecker(name); } @@ -294,16 +254,16 @@ class OpRegistry { AttributeMap attrs; for (auto& attr : op_desc.attrs()) { - attrs[attr.name()] = AttrTypeHelper::GetAttrValue(attr); + attrs[attr.name()] = GetAttrValue(attr); } return CreateOp(op_desc.type(), inputs, outputs, attrs); } - static std::shared_ptr CreateGradOp( - std::shared_ptr op) { - GradOpBuilder builder(op.get()); - std::shared_ptr grad_op(builder.Build()); + static std::shared_ptr CreateGradOp(const OperatorBase& op) { + PADDLE_ENFORCE(!op.IsNetOp(), + "Use framework::Backward to get backward ops"); + std::shared_ptr grad_op(BuildGradOp(&op)); grad_op->Init(); return grad_op; } @@ -311,7 +271,7 @@ class OpRegistry { static std::unordered_map& protos() { static std::unordered_map protos_; return protos_; - }; + } static std::unordered_map& grad_ops() { static std::unordered_map grad_ops_; @@ -333,12 +293,12 @@ class OpRegistry { static std::unordered_map& op_checkers() { static std::unordered_map op_checkers_; return op_checkers_; - }; + } static void GenerateTempVariableName(OperatorBase* op) { static std::atomic gUniqId(0UL); for (auto& outname : op->outputs_) { - if (outname == OperatorBase::TMP_VAR_NAME()) { + if (outname == kTempVarName) { outname += op->type_; outname += "@"; outname += std::to_string(gUniqId.fetch_add(1)); @@ -350,7 +310,7 @@ class OpRegistry { template class OpRegisterHelper { public: - OpRegisterHelper(const char* op_type) { + explicit OpRegisterHelper(const char* op_type) { OpRegistry::RegisterOp(op_type); } }; @@ -396,6 +356,14 @@ class GradOpRegisterHelper { return 0; \ } +/** + * Macro to Forbid user register Gradient Operator. + */ +#define NO_GRADIENT(__op_type) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_gradient_op__##__op_type##__op_type##_grad, \ + "NO_GRADIENT must be in global namespace") + /** * Macro to Register OperatorKernel. */ diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index 2ef781bf8672c8aa53ae32a44f1ea61973f3792c..9894928a7aa19bc6c7ad8b230562fb9a681cfebd 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -7,9 +7,9 @@ namespace paddle { namespace framework { class CosineOp : public OperatorBase { public: - void Run(const std::shared_ptr& scope, + void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const override {} - void InferShape(const std::shared_ptr& scope) const override {} + void InferShape(const Scope& scope) const override {} }; class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { @@ -27,8 +27,8 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { class MyTestOp : public OperatorBase { public: - void InferShape(const std::shared_ptr& scope) const override {} - void Run(const std::shared_ptr& scope, + void InferShape(const Scope& scope) const override {} + void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const override {} }; @@ -36,9 +36,8 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInputs("input", "input of cosine op"); - AddOutput("output", "output of cosine op", - /*temporary*/ true); + AddInput("input", "input of cosine op").SetMultiple(); + AddOutput("output", "output of cosine op").SetTemporary(); auto my_checker = [](int i) { PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!"); }; @@ -69,7 +68,7 @@ TEST(OpRegistry, CreateOp) { std::shared_ptr op = paddle::framework::OpRegistry::CreateOp(op_desc); - auto scope = std::make_shared(); + paddle::framework::Scope scope; paddle::platform::CPUDeviceContext dev_ctx; op->Run(scope, dev_ctx); float scale_get = op->GetAttr("scale"); @@ -111,7 +110,7 @@ TEST(OpRegistry, DefaultValue) { std::shared_ptr op = paddle::framework::OpRegistry::CreateOp(op_desc); - auto scope = std::make_shared(); + paddle::framework::Scope scope; paddle::platform::CPUDeviceContext dev_ctx; op->Run(scope, dev_ctx); ASSERT_EQ(op->GetAttr("scale"), 1.0); @@ -173,7 +172,7 @@ TEST(OpRegistry, CustomChecker) { SetInputFormat(&op_desc); auto op = paddle::framework::OpRegistry::CreateOp(op_desc); paddle::platform::CPUDeviceContext dev_ctx; - auto scope = std::make_shared(); + paddle::framework::Scope scope; op->Run(scope, dev_ctx); int test_attr = op->GetAttr("test_attr"); ASSERT_EQ(test_attr, 4); diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 745508e6ac6e8f310a8ebd8b8d0762fb8ab39bd4..d3fbb1e96199b87887cd313d3cf31a230c9b8a34 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -20,16 +20,16 @@ namespace paddle { namespace framework { template <> -Eigen::DefaultDevice* KernelContext::GetEigenDevice< +Eigen::DefaultDevice& ExecutionContext::GetEigenDevice< platform::CPUPlace, Eigen::DefaultDevice>() const { - return device_context_->get_eigen_device(); + return *device_context_.get_eigen_device(); } #ifndef PADDLE_ONLY_CPU template <> -Eigen::GpuDevice* -KernelContext::GetEigenDevice() const { - return device_context_->get_eigen_device(); +Eigen::GpuDevice& +ExecutionContext::GetEigenDevice() const { + return *device_context_.get_eigen_device(); } #endif @@ -52,7 +52,8 @@ std::vector OperatorBase::Inputs(const std::string& name) const { PADDLE_ENFORCE(in_out_idxs_ != nullptr, "IO Idx could not be nullptr"); auto input_format = GetAttr>("input_format"); auto offset = in_out_idxs_->at(name); - PADDLE_ENFORCE(input_format.at((size_t)offset + 1) <= inputs_.size(), + PADDLE_ENFORCE(input_format.at(static_cast(offset) + 1) <= + static_cast(inputs_.size()), "Input Out Of Range"); return std::vector{ @@ -78,7 +79,8 @@ std::vector OperatorBase::Outputs(const std::string& name) const { PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr"); auto output_format = GetAttr>("output_format"); auto offset = in_out_idxs_->at(name); - PADDLE_ENFORCE(output_format.at((size_t)offset + 1) <= outputs_.size(), + PADDLE_ENFORCE(output_format.at(static_cast(offset) + 1) <= + static_cast(outputs_.size()), "Output Out of Range"); return std::vector{ outputs_.begin() + output_format.at(offset), @@ -115,5 +117,11 @@ std::string OperatorBase::DebugString() const { return ss.str(); } +void OperatorBase::Rename(const std::string& old_name, + const std::string& new_name) { + std::replace(inputs_.begin(), inputs_.end(), old_name, new_name); + std::replace(outputs_.begin(), outputs_.end(), old_name, new_name); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 8d9ca92565b3d34c5ede7c66cc9eb0d629e8d4e4..02707b8a9996fca80c96d76240e5e89cbb6f451e 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -14,12 +14,13 @@ limitations under the License. */ #pragma once +#include #include #include #include #include -#include "paddle/framework/attr_checker.h" +#include "paddle/framework/attribute.h" #include "paddle/framework/op_desc.pb.h" #include "paddle/framework/op_proto.pb.h" #include "paddle/framework/scope.h" @@ -31,22 +32,29 @@ limitations under the License. */ namespace paddle { namespace framework { -template -struct EigenDeviceConverter; +/// If a variable is a empty variable, that name will be used. +const std::string kEmptyVarName = "@EMPTY@"; -template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::DefaultDevice; -}; +/// If a variable is a temporary variable, that name will be set in Python, +/// but it will be convert to a unique name in scope after OpCreator. +const std::string kTempVarName = "@TEMP@"; -#ifndef PADDLE_ONLY_CPU -template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::GpuDevice; -}; -#endif +/// If a variable's name has a certain suffix, it means that the +/// variable is the gradient of another varibale. +/// e.g. Variable "x@GRAD" is the gradient of varibale "x". +const std::string kGradVarSuffix = "@GRAD"; + +/// Variables with this suffix are supposed to be filled up with zeros. +const std::string kZeroVarSuffix = "@ZERO"; + +inline std::string GradVarName(const std::string& var_name) { + return var_name + kGradVarSuffix; +} class OperatorBase; +class InferShapeContext; +class ExecutionContext; + /** * OperatorBase has the basic element that Net will call to do computation. * Only CreateOperator from OpRegistry will new Operator directly. User @@ -55,18 +63,6 @@ class OperatorBase; */ class OperatorBase { public: - /// If a variable is a empty variable, that name will be used. - static std::string EMPTY_VAR_NAME() { return "@EMPTY@"; } - - /// If a variable is a temporary variable, that name will be set in Python, - /// but it will be convert to a unique name in scope after OpCreator. - static std::string TMP_VAR_NAME() { return "@TEMP@"; } - - /// If a variable's name has a certain suffix, it means that the - /// variable is the gradient of another varibale. - /// e.g. Variable "x@GRAD" is the gradient of varibale "x". - static std::string GRAD_VAR_SUFFIX() { return "@GRAD"; } - virtual ~OperatorBase() {} template @@ -84,16 +80,20 @@ class OperatorBase { /// InferShape infer the size of Variables used by this Operator with /// information inside scope - virtual void InferShape(const std::shared_ptr& scope) const = 0; + virtual void InferShape(const Scope& scope) const = 0; /// Net will call this function to Run an op. - virtual void Run(const std::shared_ptr& scope, + virtual void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const = 0; virtual bool IsNetOp() const { return false; } + /// rename inputs outputs name + void Rename(const std::string& old_name, const std::string& new_name); + //! Get a input with argument's name described in `op_proto` const std::string& Input(const std::string& name) const; + //! Get a input which has multiple variables. //! TODO add a vector_view to prevent memory copy. std::vector Inputs(const std::string& name) const; @@ -105,99 +105,187 @@ class OperatorBase { public: std::string type_; + // NOTE: in case of OpGrad, inputs_ contains: + // I (Inputs) + // O (Outputs) + // OG (Output Gradients) std::vector inputs_; + // NOTE: in case of OpGrad, outputs_ contains + // IG (Inputs Gradients) std::vector outputs_; AttributeMap attrs_; // store the arguments' offset described in op_desc. std::shared_ptr> in_out_idxs_; }; -class KernelContext { +class OperatorContext { public: - KernelContext(const OperatorBase* op, const std::shared_ptr& scope, - const platform::DeviceContext& device_context) - : op_(*op), scope_(scope), device_context_(&device_context) {} + OperatorContext(const OperatorBase* op, const Scope& scope) + : op_(*op), scope_(scope) {} + + size_t InputSize() const { return op_.inputs_.size(); } - const Variable* Input(int index) const { - return scope_->GetVariable(op_.inputs_[index]); + size_t OutputSize() const { return op_.outputs_.size(); } + + const Variable* InputVar(const size_t index) const { + return scope_.FindVar(op_.inputs_.at(index)); } - Variable* Output(int index) const { - return scope_->GetVariable(op_.outputs_[index]); + Variable* OutputVar(const size_t index) const { + return scope_.FindVar(op_.outputs_.at(index)); } - const Variable* Input(const std::string& name) const { - return scope_->GetVariable(op_.Input(name)); + const Variable* InputVar(const std::string& name) const { + return scope_.FindVar(op_.Input(name)); } - const Variable* Output(const std::string& name) const { - return scope_->GetVariable(op_.Output(name)); + Variable* OutputVar(const std::string& name) const { + return scope_.FindVar(op_.Output(name)); } - const std::vector Inputs(const std::string& name) const { + const std::vector MultiInputVar( + const std::string& name) const { auto names = op_.Inputs(name); std::vector res; + res.reserve(names.size()); std::transform( - names.begin(), names.end(), res.begin(), - [this](const std::string& name) { return scope_->GetVariable(name); }); + names.begin(), names.end(), std::back_inserter(res), + [this](const std::string& name) { return scope_.FindVar(name); }); return res; } - const std::vector Outputs(const std::string& name) const { + std::vector MultiOutputVar(const std::string& name) const { auto names = op_.Outputs(name); std::vector res; + res.reserve(names.size()); std::transform( - names.begin(), names.end(), res.begin(), - [this](const std::string& name) { return scope_->GetVariable(name); }); + names.begin(), names.end(), std::back_inserter(res), + [this](const std::string& name) { return scope_.FindVar(name); }); + return res; + } + + template + const T* Input(const size_t index) const { + auto var = InputVar(index); + PADDLE_ENFORCE(var != nullptr, "Input(%d) should not be nullptr", index); + return &var->Get(); + } + + template + T* Output(const size_t index) const { + auto var = OutputVar(index); + PADDLE_ENFORCE(var != nullptr, "Output(%d) should not be nullptr", index); + return var->GetMutable(); + } + + template + const T* Input(const std::string& name) const { + auto var = InputVar(name); + PADDLE_ENFORCE(var != nullptr, "Input(%s) should not be nullptr", name); + return &var->Get(); + } + + template + T* Output(const std::string& name) const { + auto var = OutputVar(name); + PADDLE_ENFORCE(var != nullptr, "Output(%s) should not be nullptr", name); + return var->GetMutable(); + } + + template + const std::vector MultiInput(const std::string& name) const { + auto names = op_.Inputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + PADDLE_ENFORCE(var != nullptr, + "MultiInput(%s:%s) should not be nullptr", + name, sub_name); + return &var->Get(); + }); + return res; + } + + template + std::vector MultiOutput(const std::string& name) const { + auto names = op_.Outputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + PADDLE_ENFORCE(var != nullptr, + "MultiOutput(%s:%s) should not be nullptr", + name, sub_name); + return var->GetMutable(); + }); return res; } + const OperatorBase& op_; + const Scope& scope_; +}; + +class InferShapeContext : public OperatorContext { + public: + InferShapeContext(const OperatorBase* op, const Scope& scope) + : OperatorContext(op, scope) {} +}; + +template +struct EigenDeviceConverter; + +template <> +struct EigenDeviceConverter { + using EigenDeviceType = Eigen::DefaultDevice; +}; + +#ifndef PADDLE_ONLY_CPU +template <> +struct EigenDeviceConverter { + using EigenDeviceType = Eigen::GpuDevice; +}; +#endif + +class ExecutionContext : public OperatorContext { + public: + ExecutionContext(const OperatorBase* op, const Scope& scope, + const platform::DeviceContext& device_context) + : OperatorContext(op, scope), device_context_(device_context) {} + template ::EigenDeviceType> - DeviceType* GetEigenDevice() const; + DeviceType& GetEigenDevice() const; platform::Place GetPlace() const { return device_context_->GetPlace(); } - const OperatorBase& op_; - const std::shared_ptr scope_; - const platform::DeviceContext* device_context_; + const platform::DeviceContext& device_context_; }; class OpKernel { public: /** - * KernelContext is the only parameter of Kernel Run function. + * ExecutionContext is the only parameter of Kernel Run function. * Run will get input/output variables, state such as momentum and * device resource such as CUDA stream, cublas handle, etc. from - * KernelContext. User should construct it before run the Operator. + * ExecutionContext. User should construct it before run the Operator. */ - virtual void Compute(const KernelContext& context) const = 0; + virtual void Compute(const ExecutionContext& context) const = 0; virtual ~OpKernel() {} }; -template -struct VarToTensor {}; - -template <> -struct VarToTensor { - Tensor* operator()(Variable* var) { return var->GetMutable(); } -}; - -template <> -struct VarToTensor { - const Tensor* operator()(Variable* var) { return &var->Get(); } -}; - class OperatorWithKernel : public OperatorBase { public: struct OpKernelKey { platform::Place place_; OpKernelKey() = default; - OpKernelKey(const platform::DeviceContext& dev_ctx) { + explicit OpKernelKey(const platform::DeviceContext& dev_ctx) { place_ = dev_ctx.GetPlace(); } @@ -216,10 +304,14 @@ class OperatorWithKernel : public OperatorBase { using OpKernelMap = std::unordered_map, OpKernelHash>; - void Run(const std::shared_ptr& scope, + void InferShape(const Scope& scope) const { + InferShape(InferShapeContext(this, scope)); + } + + void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const final { auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx)); - opKernel->Compute(KernelContext(this, scope, dev_ctx)); + opKernel->Compute(ExecutionContext(this, scope, dev_ctx)); } static std::unordered_map& @@ -228,34 +320,8 @@ class OperatorWithKernel : public OperatorBase { return g_all_op_kernels; } - void InferShape(const std::shared_ptr& scope) const final { - std::vector ins; - VarNamesToTensors(scope, inputs_, &ins); - std::vector outs; - VarNamesToTensors(scope, outputs_, &outs); - InferShape(ins, outs); - }; - - private: - template - void VarNamesToTensors(const std::shared_ptr& scope, - const std::vector& var_names, - std::vector* container) const { - container->reserve(var_names.size()); - VarToTensor convert; - for (auto& name : var_names) { - auto var = scope->GetVariable(name); - if (var != nullptr) { - container->push_back(convert(var)); - } else { - container->push_back(nullptr); - } - } - } - protected: - virtual void InferShape(const std::vector& inputs, - const std::vector& outputs) const = 0; + virtual void InferShape(const InferShapeContext& ctx) const = 0; }; } // namespace framework diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 3fae356c3e5d5b44271440b66d6923fd4994b937..387aada749ba62246b44dedc050547c05955caa9 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -24,15 +24,15 @@ static int op_run_num = 0; class OpWithoutKernelTest : public OperatorBase { public: void Init() override { x = 1; } - void InferShape(const std::shared_ptr& scope) const override {} - void Run(const std::shared_ptr& scope, + void InferShape(const Scope& scope) const override {} + void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const override { op_run_num++; ASSERT_EQ((int)inputs_.size(), 1); ASSERT_EQ((int)outputs_.size(), 1); - ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr); + ASSERT_EQ(scope.FindVar(inputs_[0]), nullptr); ASSERT_EQ(x, 1); - ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr); + ASSERT_NE(scope.FindVar(outputs_[0]), nullptr); } public: @@ -68,11 +68,12 @@ TEST(OperatorBase, all) { attr->set_f(3.14); paddle::platform::CPUDeviceContext device_context; - auto scope = std::make_shared(); + paddle::framework::Scope scope; auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - scope->CreateVariable("OUT1"); + scope.NewVar("OUT1"); ASSERT_EQ(paddle::framework::op_run_num, 0); + op->InferShape(scope); op->Run(scope, device_context); ASSERT_EQ(paddle::framework::op_run_num, 1); } @@ -97,14 +98,13 @@ static int cpu_kernel_run_num = 0; class OpWithKernelTest : public OperatorWithKernel { protected: - void InferShape(const std::vector& inputs, - const std::vector& outputs) const override {} + void InferShape(const framework::InferShapeContext& ctx) const override {} }; template class CPUKernelTest : public OpKernel { public: - void Compute(const KernelContext& ctx) const { + void Compute(const ExecutionContext& ctx) const { std::cout << "this is cpu kernel" << std::endl; std::cout << ctx.op_.DebugString() << std::endl; cpu_kernel_run_num++; @@ -117,12 +117,12 @@ class CPUKernelTest : public OpKernel { class OperatorMultiInputsTest : public OperatorBase { public: void Init() override { x = 1; } - void InferShape(const std::shared_ptr& scope) const override {} - void Run(const std::shared_ptr& scope, + void InferShape(const Scope& scope) const override {} + void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const override { - ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr); + ASSERT_EQ(scope.FindVar(inputs_[0]), nullptr); ASSERT_EQ(x, 1); - ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr); + ASSERT_NE(scope.FindVar(outputs_[0]), nullptr); ASSERT_EQ(Input("x"), "IN1"); ASSERT_EQ(Input("y"), "OUT1"); } @@ -137,9 +137,9 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInputs("xs", "inputs of test op"); + AddInput("xs", "inputs of test op").SetMultiple(); AddInput("k", "input of test op"); - AddOutputs("ys", "outputs of test op"); + AddOutput("ys", "outputs of test op").SetMultiple(); AddAttr("scale", "scale of cosine op") .SetDefault(1.0) .LargerThan(0.0); @@ -149,13 +149,31 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker class CPUKernalMultiInputsTest : public OpKernel { public: - void Compute(const KernelContext& ctx) const { + void Compute(const ExecutionContext& ctx) const { auto xs = ctx.op_.Inputs("xs"); ASSERT_EQ(xs.size(), 3UL); ASSERT_EQ(xs[0], "x0"); ASSERT_EQ(xs[1], "x1"); ASSERT_EQ(xs[2], "x2"); + auto inVar0 = ctx.MultiInputVar("xs"); + ASSERT_EQ(inVar0.size(), 3U); + + auto intVar1 = ctx.InputVar("k"); + ASSERT_NE(intVar1, nullptr); + + auto outVar0 = ctx.MultiOutputVar("ys"); + ASSERT_EQ(outVar0.size(), 2U); + + auto inTensor0 = ctx.MultiInput("xs"); + ASSERT_EQ(inTensor0.size(), 3U); + + auto intTensor1 = ctx.Input("k"); + ASSERT_NE(intTensor1, nullptr); + + auto outTensor0 = ctx.MultiOutput("ys"); + ASSERT_EQ(outTensor0.size(), 2U); + auto k = ctx.op_.Input("k"); ASSERT_EQ(k, "k0"); @@ -186,7 +204,7 @@ TEST(OpKernel, all) { attr->set_f(3.14); paddle::platform::CPUDeviceContext cpu_device_context; - auto scope = std::make_shared(); + paddle::framework::Scope scope; auto op = paddle::framework::OpRegistry::CreateOp(op_desc); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); @@ -232,7 +250,13 @@ TEST(OpKernel, multi_inputs) { output_format->Add(2); // y1 paddle::platform::CPUDeviceContext cpu_device_context; - auto scope = std::make_shared(); + paddle::framework::Scope scope; + scope.NewVar("x0")->GetMutable(); + scope.NewVar("x1")->GetMutable(); + scope.NewVar("x2")->GetMutable(); + scope.NewVar("k0")->GetMutable(); + scope.NewVar("y0")->GetMutable(); + scope.NewVar("y1")->GetMutable(); auto op = paddle::framework::OpRegistry::CreateOp(op_desc); op->Run(scope, cpu_device_context); diff --git a/paddle/framework/pybind.cc b/paddle/framework/pybind.cc new file mode 100644 index 0000000000000000000000000000000000000000..cbb86c4195a6c7e976fc5e0dd69d77be46dfb17c --- /dev/null +++ b/paddle/framework/pybind.cc @@ -0,0 +1,238 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/framework/backward.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" +#include "paddle/framework/scope.h" +#include "paddle/framework/tensor_py.h" +#include "paddle/operators/net_op.h" +#include "paddle/operators/type_alias.h" +#include "paddle/platform/enforce.h" +#include "paddle/platform/place.h" +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +USE_OP(add_two); +USE_OP(onehot_cross_entropy); +USE_OP_WITHOUT_KERNEL(fc); +USE_OP(sgd); +USE_OP(mul); +USE_OP(mean); +USE_OP(sigmoid); +USE_OP(softmax); +USE_OP(rowwise_add); +USE_OP_WITHOUT_KERNEL(recurrent_op); +namespace paddle { +namespace framework { +template +void ExposeOperator(ClassType &m) { + m.def("infer_shape", &ClassType::type::InferShape) + .def("run", &ClassType::type::Run) + .def("type", + [](const typename ClassType::type &op) -> std::string { + return op.type_; + }) + .def("outputs", + [](const typename ClassType::type &op) -> std::vector { + return op.outputs_; + }) + .def("__str__", &ClassType::type::DebugString); +} + +static size_t UniqueIntegerGenerator() { + static std::atomic generator; + return generator.fetch_add(1); +} + +bool IsCompileGPU() { +#ifdef PADDLE_ONLY_CPU + return false; +#else + return true; +#endif +} + +PYBIND11_PLUGIN(core) { + py::module m("core", "C++ core of PaddlePaddle"); + + py::class_(m, "Tensor", py::buffer_protocol()) + .def_buffer( + [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) + .def("get_dims", + [](const Tensor &self) { return vectorize(self.dims()); }) + .def("set_dims", + [](Tensor &self, const std::vector &dim) { + self.Resize(make_ddim(dim)); + }) + .def("alloc_float", + [](Tensor &self, paddle::platform::GPUPlace &place) { + self.mutable_data(place); + }) + .def("alloc_float", + [](Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) + .def("alloc_int", + [](Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) + .def("alloc_int", + [](Tensor &self, paddle::platform::GPUPlace &place) { + self.mutable_data(place); + }) + .def("set", PyCPUTensorSetFromArray) + .def("set", PyCPUTensorSetFromArray) +#ifndef PADDLE_ONLY_CPU + .def("set", PyCUDATensorSetFromArray) + .def("set", PyCUDATensorSetFromArray) +#endif + .def("shape", [](Tensor &self) { return vectorize(self.dims()); }) + .def("set_float_element", + [](Tensor &self, size_t offset, float f) { + // TODO(yuyang18): Only support GPU now. + self.data()[offset] = f; + }) + .def("get_float_element", [](Tensor &self, size_t offset) -> float { + // TODO(yuyang18): Only support GPU now. + return self.data()[offset]; + }); + + py::class_(m, "Variable", R"DOC(Variable Class. + +All parameter, weight, gradient are variables in Paddle. +)DOC") + .def("is_int", [](const Variable &var) { return var.IsType(); }) + .def("set_int", + [](Variable &var, int val) -> void { *var.GetMutable() = val; }) + .def("get_int", [](const Variable &var) -> int { return var.Get(); }) + .def("get_tensor", + [](Variable &self) -> Tensor * { return self.GetMutable(); }, + py::return_value_policy::reference) + .def("get_net", + [](Variable &self) -> ops::NetOp * { + return self.GetMutable(); + }, + py::return_value_policy::reference); + + py::class_(m, "Scope", "") + .def("new_var", + [](Scope &self, const std::string &name) -> Variable * { + return self.NewVar(name); + }, + py::return_value_policy::reference) + .def("find_var", &Scope::FindVar, py::return_value_policy::reference) + .def(py::init<>()) + .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, + py::return_value_policy::reference) + .def("drop_kids", &Scope::DropKids); + + //! @note: Be careful! PyBind will return std::string as an unicode, not + //! Python str. If you want a str object, you should cast them in Python. + m.def("get_all_op_protos", []() -> std::vector { + auto &protos = OpRegistry::protos(); + std::vector ret_values; + for (auto it = protos.begin(); it != protos.end(); ++it) { + PADDLE_ENFORCE(it->second.IsInitialized(), + "OpProto must all be initialized"); + std::string str; + PADDLE_ENFORCE(it->second.SerializeToString(&str), + "Serialize OpProto Error. This could be a bug of Paddle."); + ret_values.push_back(py::bytes(str)); + } + return ret_values; + }); + m.def_submodule( + "var_names", + "The module will return special predefined variable name in Paddle") + .def("empty", []() { return kEmptyVarName; }) + .def("temp", []() { return kTempVarName; }); + // clang-format off + py::class_(m, "DeviceContext") + .def_static("create", + [](paddle::platform::CPUPlace& place) + -> paddle::platform::DeviceContext* { + return new paddle::platform::CPUDeviceContext(); + }) + .def_static("create", + [](paddle::platform::GPUPlace& place) + -> paddle::platform::DeviceContext* { +#ifdef PADDLE_ONLY_CPU + PADDLE_THROW("GPUPlace is not supported in CPU device."); +#else + return new paddle::platform::CUDADeviceContext(place); +#endif + }); + // clang-format on + + py::class_(m, "GPUPlace").def(py::init()); + + py::class_(m, "CPUPlace").def(py::init<>()); + + py::class_> operator_base( + m, "Operator"); + + operator_base.def_static("create", [](py::bytes protobin) { + OpDesc desc; + PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), + "Cannot parse user input to OpDesc"); + PADDLE_ENFORCE(desc.IsInitialized(), + "User OpDesc is not initialized, reason %s", + desc.InitializationErrorString()); + return OpRegistry::CreateOp(desc); + }); + + operator_base.def("backward", + [](const OperatorBase &forwardOp, + const std::unordered_set &no_grad_vars) { + return Backward(forwardOp, no_grad_vars); + }); + + ExposeOperator(operator_base); + + py::class_> net(m, "Net"); + + net.def_static("create", + []() -> std::shared_ptr { + auto retv = std::make_shared(); + retv->type_ = "plain_net"; + return retv; + }) + .def("add_op", &ops::NetOp::AddOp) + .def( + "add_op", + [](ops::NetOp &self, const std::shared_ptr &net) -> void { + self.AddOp(std::static_pointer_cast(net)); + }) + .def("complete_add_op", &ops::NetOp::CompleteAddOp) + .def("complete_add_op", + [](std::shared_ptr &self) { self->CompleteAddOp(); }); + + ExposeOperator(net); + + m.def("unique_integer", UniqueIntegerGenerator); + + m.def("is_compile_gpu", IsCompileGPU); + + return m.ptr(); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc new file mode 100644 index 0000000000000000000000000000000000000000..080b4ac621c1b8c0d4b4e7b26f394cf2be263894 --- /dev/null +++ b/paddle/framework/scope.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/scope.h" +#include "paddle/string/printf.h" + +namespace paddle { +namespace framework { + +Scope::~Scope() { + DropKids(); + for (auto& kv : vars_) delete kv.second; +} + +Scope& Scope::NewScope() const { + kids_.push_back(new Scope(this)); + return *kids_.back(); +} + +Variable* Scope::NewVar(const std::string& name) { + auto iter = vars_.find(name); + if (iter != vars_.end()) { + return iter->second; + } + Variable* v = new Variable(); + vars_[name] = v; + v->name_ = &(vars_.find(name)->first); + return v; +} + +Variable* Scope::NewVar() { + return NewVar(string::Sprintf("%p.%d", this, vars_.size())); +} + +Variable* Scope::FindVar(const std::string& name) const { + auto it = vars_.find(name); + if (it != vars_.end()) return it->second; + return (parent_ == nullptr) ? nullptr : parent_->FindVar(name); +} + +const Scope* Scope::FindScope(const Variable* var) const { + for (auto& kv : vars_) { + if (kv.second == var) { + return this; + } + } + return (parent_ == nullptr) ? nullptr : parent_->FindScope(var); +} +void Scope::DropKids() { + for (Scope* s : kids_) delete s; + kids_.clear(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index 4faaf841440ba30b79c83d09fea977186bd0270a..2ba3f8ed355b48800cfa4180e4e8a94f2c9958a9 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -14,9 +14,9 @@ limitations under the License. */ #pragma once +#include #include #include -#include #include "paddle/framework/variable.h" @@ -35,73 +35,42 @@ class Scope; */ class Scope { public: - /** - * @brief Initialize s Scope without parent. - */ Scope() {} + ~Scope(); - /** - * @brief Initialize a Scope with parent. - */ - explicit Scope(const std::shared_ptr& parent) : parent_(parent) {} - - /** - * @brief Create Variable - * - * Create Variable in this Scope. Return the exist one if Variable already - * been created. - */ - Variable* CreateVariable(const std::string& name) { - auto var = GetVariable(name); - if (var) { - return var; - } else { - auto ptr = new Variable(); - name_to_var_[name] = std::unique_ptr(ptr); - var_to_name_[ptr] = name; - return GetVariable(name); - } - } - - /** - * @brief Get Variable. - * - * Get Variable from this Scope, this function will recursive find Variable - * from it's parent scope. Return nullptr if not found. - */ - Variable* GetVariable(const std::string& name) const { - auto it = name_to_var_.find(name); - if (it != name_to_var_.end()) { - return it->second.get(); - } else if (parent_ != nullptr) { - return parent_->GetVariable(name); - } else { - return nullptr; - } - } - - /** - * @brief If this scope has a Var named name. - * - * Find if there is a Variable in this scope and it's parent scope - */ - bool HasVariable(const std::string& name) const { - return (name_to_var_.find(name) != name_to_var_.end() || - (parent_ && parent_->HasVariable(name))); - } - - std::string GetVariableName(Variable* const var) const { - try { - return var_to_name_.at(var); - } catch (...) { - return ""; - } - } + // Disable Copy, Assign, Move. + Scope(const Scope& other) = delete; + Scope& operator=(const Scope& other) = delete; + Scope(Scope&& other) = delete; + + /// Create a sub-scope. Returns a reference other than a pointer so + /// to prevent from manual deletion. + /// Mark it to const because that new kid scope cannot change parent scope. + Scope& NewScope() const; + + /// Create a variable with given name if it doesn't exist. + Variable* NewVar(const std::string& name); + + /// Create a variable with a scope-unique name. + Variable* NewVar(); + + /// Find a variable in the scope or any of its ancestors. Returns + /// nullptr if cannot find. + Variable* FindVar(const std::string& name) const; + + /// Find the scope or an ancestor scope that contains the given variable. + const Scope* FindScope(const Variable* var) const; + + /// Drop all kids scopes belonged to this scope. + void DropKids(); private: - std::unordered_map var_to_name_; - std::unordered_map> name_to_var_; - std::shared_ptr parent_{nullptr}; + // Call Scope::NewScope for a sub-scope. + explicit Scope(Scope const* parent) : parent_(parent) {} + + std::unordered_map vars_; + mutable std::list kids_; + Scope const* parent_{nullptr}; }; } // namespace framework diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc index ff069c7be002e9bcfd63225c3d80aa958935ba14..9d51e355b0f6336d2f875ff2d77266b261baf5ac 100644 --- a/paddle/framework/scope_test.cc +++ b/paddle/framework/scope_test.cc @@ -15,49 +15,42 @@ limitations under the License. */ #include "paddle/framework/scope.h" #include "gtest/gtest.h" -TEST(Scope, Create) { - using paddle::framework::Scope; - using paddle::framework::Variable; +using paddle::framework::Scope; +using paddle::framework::Variable; - auto scope = std::make_shared(); +TEST(Scope, VarsShadowing) { + Scope s; + Scope& ss1 = s.NewScope(); + Scope& ss2 = s.NewScope(); - Variable* var0 = scope->CreateVariable(""); - EXPECT_NE(var0, nullptr); + Variable* v0 = s.NewVar("a"); + Variable* v1 = ss1.NewVar("a"); - /// GetVariable will return nullptr if not exist. - Variable* var1 = scope->GetVariable("a"); - EXPECT_EQ(var1, nullptr); + EXPECT_NE(v0, v1); - /// CreateVariable will return one. - Variable* var2 = scope->CreateVariable("a"); - EXPECT_NE(var2, nullptr); - - /// Get the created variable. - Variable* var3 = scope->GetVariable("a"); - EXPECT_EQ(var2, var3); + EXPECT_EQ(v0, s.FindVar("a")); + EXPECT_EQ(v1, ss1.FindVar("a")); + EXPECT_EQ(v0, ss2.FindVar("a")); +} - /// CreateVariable will just return the variable if it's - /// already exist. - Variable* var4 = scope->CreateVariable("a"); - EXPECT_EQ(var4, var2); +TEST(Scope, FindVar) { + Scope s; + Scope& ss = s.NewScope(); - EXPECT_EQ("a", scope->GetVariableName(var4)); - Scope scope2; - auto var = scope2.CreateVariable("tmp"); - EXPECT_EQ("", scope->GetVariableName(var)); -} + EXPECT_EQ(nullptr, s.FindVar("a")); + EXPECT_EQ(nullptr, ss.FindVar("a")); -TEST(Scope, Parent) { - using paddle::framework::Scope; - using paddle::framework::Variable; + ss.NewVar("a"); - auto parent_scope = std::make_shared(); - auto scope = std::make_shared(parent_scope); + EXPECT_EQ(nullptr, s.FindVar("a")); + EXPECT_NE(nullptr, ss.FindVar("a")); +} - Variable* var0 = parent_scope->CreateVariable("a"); - EXPECT_NE(var0, nullptr); +TEST(Scope, FindScope) { + Scope s; + Scope& ss = s.NewScope(); + Variable* v = s.NewVar("a"); - /// GetVariable will get Variable from parent scope if exist. - Variable* var1 = scope->GetVariable("a"); - EXPECT_EQ(var0, var1); + EXPECT_EQ(&s, s.FindScope(v)); + EXPECT_EQ(&s, ss.FindScope(v)); } diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 76070f636b0971f4a136042e056c59adb5dc2d40..4c3b14b83d841e88683a13634c93f51c012128b6 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -26,19 +26,17 @@ limitations under the License. */ #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { -namespace pybind { -namespace details { // forward declare -template -struct CastToPyBufferImpl; -} // namespace details -} // namespace pybind namespace framework { +namespace details { +template +struct CastToPyBufferImpl; +} class Tensor { public: template - friend struct paddle::pybind::details::CastToPyBufferImpl; + friend struct details::CastToPyBufferImpl; template friend struct EigenTensor; @@ -167,4 +165,4 @@ class Tensor { } // namespace framework } // namespace paddle -#include "paddle/framework/detail/tensor-inl.h" +#include "paddle/framework/tensor_impl.h" diff --git a/paddle/framework/detail/tensor-inl.h b/paddle/framework/tensor_impl.h similarity index 97% rename from paddle/framework/detail/tensor-inl.h rename to paddle/framework/tensor_impl.h index e7ff09dd5c954378afeca299e901277c3ebdb96a..92621f8c18ec0d03160a23c462830d14272c7f64 100644 --- a/paddle/framework/detail/tensor-inl.h +++ b/paddle/framework/tensor_impl.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - #include "paddle/memory/memcpy.h" namespace paddle { @@ -62,9 +61,11 @@ inline T* Tensor::mutable_data(platform::Place place) { if (platform::is_cpu_place(place)) { holder_.reset(new PlaceholderImpl( boost::get(place), size)); + } else if (platform::is_gpu_place(place)) { +#ifdef PADDLE_ONLY_CPU + PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); } -#ifndef PADDLE_ONLY_CPU - else if (platform::is_gpu_place(place)) { +#else holder_.reset(new PlaceholderImpl( boost::get(place), size)); } diff --git a/paddle/pybind/tensor_bind.h b/paddle/framework/tensor_py.h similarity index 64% rename from paddle/pybind/tensor_bind.h rename to paddle/framework/tensor_py.h index 995e102bf9d342e1604f5ae704288d6cf68d97a4..4e1ab77b157fe1adaeac55c271c056236f2d40de 100644 --- a/paddle/pybind/tensor_bind.h +++ b/paddle/framework/tensor_py.h @@ -13,15 +13,17 @@ limitations under the License. */ #pragma once -#include -#include -#include +#include +#include "paddle/framework/tensor.h" +#include "paddle/memory/memcpy.h" +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" namespace py = pybind11; namespace paddle { -namespace pybind { +namespace framework { namespace details { @@ -40,9 +42,6 @@ template struct CastToPyBufferImpl { using CUR_TYPE = typename std::tuple_element>::type; py::buffer_info operator()(framework::Tensor &tensor) { - PADDLE_ENFORCE(paddle::platform::is_cpu_place(tensor.holder_->place()), - "Only CPU tensor can cast to numpy array"); - if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) { auto dim_vec = framework::vectorize(tensor.dims()); std::vector dims_outside; @@ -56,14 +55,16 @@ struct CastToPyBufferImpl { strides[i - 1] = sizeof(CUR_TYPE) * prod; prod *= dims_outside[i - 1]; } - + framework::Tensor dst_tensor; + if (paddle::platform::is_gpu_place(tensor.holder_->place())) { + dst_tensor.CopyFrom(tensor, platform::CPUPlace()); + } else if (paddle::platform::is_cpu_place(tensor.holder_->place())) { + dst_tensor = tensor; + } return py::buffer_info( - tensor.mutable_data(tensor.holder_->place()), - sizeof(CUR_TYPE), - py::format_descriptor::format(), - (size_t)framework::arity(tensor.dims()), - dims_outside, - strides); + dst_tensor.mutable_data(dst_tensor.holder_->place()), + sizeof(CUR_TYPE), py::format_descriptor::format(), + (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides); } else { constexpr bool less = I + 1 < std::tuple_size>::value; return CastToPyBufferImpl()(tensor); @@ -77,9 +78,10 @@ inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) { } template -void PyTensorSetFromArray( +void PyCPUTensorSetFromArray( framework::Tensor &self, - py::array_t array) { + py::array_t array, + paddle::platform::CPUPlace &place) { std::vector dims; dims.reserve(array.ndim()); for (size_t i = 0; i < array.ndim(); ++i) { @@ -87,9 +89,28 @@ void PyTensorSetFromArray( } self.Resize(framework::make_ddim(dims)); - auto *dst = self.mutable_data(paddle::platform::CPUPlace()); + auto *dst = self.mutable_data(place); std::memcpy(dst, array.data(), sizeof(T) * array.size()); } +#ifndef PADDLE_ONLY_CPU +template +void PyCUDATensorSetFromArray( + framework::Tensor &self, + py::array_t array, + paddle::platform::GPUPlace &place) { + std::vector dims; + dims.reserve(array.ndim()); + for (size_t i = 0; i < array.ndim(); ++i) { + dims.push_back((int)array.shape()[i]); + } + + self.Resize(framework::make_ddim(dims)); + auto *dst = self.mutable_data(place); + paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(), + cudaMemcpyHostToDevice); +} +#endif + } // namespace pybind } // namespace paddle diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h index 72c4a7a2a1d1cf93a784f24e687727ee8481484c..38fc2720a3023039aa113b32a394bda9c5def4c0 100644 --- a/paddle/framework/variable.h +++ b/paddle/framework/variable.h @@ -16,7 +16,7 @@ #include #include -#include "paddle/platform/assert.h" +#include "paddle/platform/enforce.h" namespace paddle { namespace framework { @@ -25,7 +25,7 @@ class Variable { public: template const T& Get() const { - PADDLE_ASSERT(IsType()); + PADDLE_ENFORCE(IsType(), "Variable must be type %s", typeid(T).name()); return *static_cast(holder_->Ptr()); } @@ -65,6 +65,17 @@ class Variable { std::unique_ptr holder_; // pointers to a PlaceholderImpl object indeed. + + // name_ is only meaningful with a Scope and accessible by it. + // + // NOTE: Please don't expose name_ by adding methods like + // Variable::Name or Scope::VarName! A variable could have a human + // readable name or an auto-generated scope-unique name. In the + // former case, the caller knows the name and doesn't need to access + // the name; in the latter case, the variable should be identified + // by its address but not the unreadable name. + friend class Scope; + const std::string* name_; }; } // namespace framework diff --git a/paddle/function/BlockExpandOpTest.cpp b/paddle/function/BlockExpandOpTest.cpp index 5e4897e72ba9fab2dd9e25d90313dc1b4d38e2d4..59193a3ec3d0fabe7c841372394204ab568f5a2b 100644 --- a/paddle/function/BlockExpandOpTest.cpp +++ b/paddle/function/BlockExpandOpTest.cpp @@ -18,10 +18,10 @@ limitations under the License. */ namespace paddle { TEST(BlockExpandForward, real) { - for (size_t batchSize : {5, 32}) { - for (size_t channels : {1, 5, 32}) { - for (size_t inputHeight : {5, 33, 100}) { - for (size_t inputWidth : {5, 32, 96}) { + for (size_t batchSize : {5}) { + for (size_t channels : {1, 5}) { + for (size_t inputHeight : {5, 33}) { + for (size_t inputWidth : {5, 32}) { for (size_t block : {1, 3, 5}) { for (size_t stride : {1, 2}) { for (size_t padding : {0, 1}) { @@ -61,10 +61,10 @@ TEST(BlockExpandForward, real) { } TEST(BlockExpandBackward, real) { - for (size_t batchSize : {5, 32}) { - for (size_t channels : {1, 5, 32}) { - for (size_t inputHeight : {5, 33, 100}) { - for (size_t inputWidth : {5, 32, 96}) { + for (size_t batchSize : {5}) { + for (size_t channels : {1, 5}) { + for (size_t inputHeight : {5, 33}) { + for (size_t inputWidth : {5, 32}) { for (size_t block : {1, 3, 5}) { for (size_t stride : {1, 2}) { for (size_t padding : {0, 1}) { diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp index 1744f377808f137dcda4a28acce336dc22be3d01..6b8e1e2da9775ccd03c84cc86ad226f3c00ab7fe 100644 --- a/paddle/function/BufferArgTest.cpp +++ b/paddle/function/BufferArgTest.cpp @@ -32,7 +32,7 @@ TEST(BufferTest, SequenceIdArg) { sizeOfValuType(VALUE_TYPE_INT32)); SequenceIdArg buffer(memory.getBuf(), shape); EXPECT_EQ(buffer.data(), memory.getBuf()); - EXPECT_EQ(buffer.numSeqs(), 9); + EXPECT_EQ(buffer.numSeqs(), 9U); } } // namespace paddle diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu index 1a5b4042402df3081a493962a5e080d72b7f40b2..4492dea5d8a6f8580a13f3059401c87fa2164085 100644 --- a/paddle/function/ContextProjectionOpGpu.cu +++ b/paddle/function/ContextProjectionOpGpu.cu @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "ContextProjectionOp.h" +#include "hl_base.h" namespace paddle { @@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input, int block_size = blockDim.x; int sequenceId = blockIdx.x; int seq_start = sequence[sequenceId]; - int seq_end = sequence[sequenceId+1]; + int seq_end = sequence[sequenceId + 1]; real value = 0; int instances = seq_end - seq_start + context_length - 1; @@ -49,8 +49,9 @@ __global__ void KeContextProjectionForward(const real* input, } else if ((i + context_start) >= (seq_end - seq_start)) { if (padding) { value = - weight[(begin_pad + i + context_start - (seq_end - seq_start)) * - input_dim + idx]; + weight[(begin_pad + i + context_start - (seq_end - seq_start)) * + input_dim + + idx]; } else { continue; } @@ -61,7 +62,7 @@ __global__ void KeContextProjectionForward(const real* input, int outx = (i - context_length) < 0 ? i : (context_length - 1); int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1)); real* output_r = - output + outy * input_dim * context_length + outx * input_dim; + output + outy * input_dim * context_length + outx * input_dim; for (int j = outy; j < seq_end - seq_start; j++) { output_r[idx] += value; if (j - outy == outx) break; @@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input, dim3 grid(blocks_x, blocks_y); if (weight) { - KeContextProjectionForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (input, sequence, weight, output, input_dim, - context_length, context_start, begin_pad); - } else { - KeContextProjectionForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (input, sequence, weight, output, input_dim, - context_length, context_start, begin_pad); + KeContextProjectionForward<<>>( + input, + sequence, + weight, + output, + input_dim, + context_length, + context_start, + begin_pad); + } else { + KeContextProjectionForward<<>>( + input, + sequence, + weight, + output, + input_dim, + context_length, + context_start, + begin_pad); } CHECK_SYNC("hl_context_projection_forward failed"); } @@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad, int block_size = blockDim.x; int sequenceId = blockIdx.x; int seq_start = sequence[sequenceId]; - int seq_end = sequence[sequenceId+1]; + int seq_end = sequence[sequenceId + 1]; real value = 0; int instances = seq_end - seq_start + context_length - 1; @@ -170,7 +183,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad, int outx = (i - context_length) < 0 ? i : (context_length - 1); int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1)); real* output_r = - out + outy * input_dim * context_length + outx * input_dim; + out + outy * input_dim * context_length + outx * input_dim; for (int j = outy; j < seq_end - seq_start; j++) { value += output_r[idx]; if (j - outy == outx) break; @@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad, int blocks_y = 1; dim3 threads(block_size, 1); dim3 grid(blocks_x, blocks_y); - KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>> - (out_grad, sequence, input_grad, input_dim, context_length, context_start); + KeContextProjectionBackwardData<<>>( + out_grad, sequence, input_grad, input_dim, context_length, context_start); CHECK_SYNC("hl_context_projection_backward_data failed"); } @@ -231,7 +244,7 @@ void ContextProjectionBackwardData(const GpuMatrix& out_grad, context_start); } -template +template __global__ void KeContextProjectionBackwardWeight(const real* out_grad, const int* sequence, real* w_grad, @@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad, if (weight_idx < w_dim) { for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) { int seq_start = sequence[seqId]; - int seq_end = sequence[seqId+1]; - output_r = const_cast(out_grad) - + seq_start * w_dim * context_length; + int seq_end = sequence[seqId + 1]; + output_r = + const_cast(out_grad) + seq_start * w_dim * context_length; if (context_start < 0) { if (padId + context_start < 0) { instanceId = padId; } else { // begin_pad > 0; - instanceId = (padId - begin_pad) + - (seq_end - seq_start) - context_start; + instanceId = + (padId - begin_pad) + (seq_end - seq_start) - context_start; } } else { if (padId + (seq_end - seq_start) < context_start) { @@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad, } } - int outx = (instanceId - context_length) < 0 ? - instanceId : (context_length - 1); - int outy = (instanceId - context_length) < 0 ? - 0 : (instanceId - (context_length - 1)); + int outx = + (instanceId - context_length) < 0 ? instanceId : (context_length - 1); + int outy = (instanceId - context_length) < 0 + ? 0 + : (instanceId - (context_length - 1)); output_r += outy * w_dim * context_length + outx * w_dim; for (int j = outy; j < seq_end - seq_start; j++) { value += output_r[weight_idx]; @@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad, } __syncthreads(); - for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) { + for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) { if (idy < stride) { sum_s[idy][idx] += sum_s[idy + stride][idx]; } @@ -339,22 +353,27 @@ void hl_context_projection_backward_weight(const real* out_grad, dim3 threads(threads_x, threads_y); dim3 grid(blocks_x, 1); - KeContextProjectionBackwardWeight<32, 32> - <<< grid, threads, 0, STREAM_DEFAULT >>> - (out_grad, sequence, w_grad, num_sequences, w_dim, - context_length, context_start, begin_pad); + KeContextProjectionBackwardWeight<32, + 32><<>>( + out_grad, + sequence, + w_grad, + num_sequences, + w_dim, + context_length, + context_start, + begin_pad); CHECK_SYNC("hl_context_projection_backward_weight failed"); } template <> -void ContextProjectionBackwardWeight( - const GpuMatrix& out_grad, - GpuMatrix& w_grad, - const GpuIVector& seq_vec, - size_t context_length, - int context_start, - size_t total_pad, - size_t begin_pad) { +void ContextProjectionBackwardWeight(const GpuMatrix& out_grad, + GpuMatrix& w_grad, + const GpuIVector& seq_vec, + size_t context_length, + int context_start, + size_t total_pad, + size_t begin_pad) { hl_context_projection_backward_weight(out_grad.getData(), seq_vec.getData(), w_grad.getData(), @@ -376,23 +395,18 @@ void ContextProjectionBackward(const GpuMatrix& out_grad, size_t begin_pad, bool is_padding, size_t total_pad) { - if (in_grad) { - ContextProjectionBackwardData( - out_grad, - in_grad, - sequence, - context_length, - context_start); - } - if (is_padding && w_grad) { - ContextProjectionBackwardWeight( - out_grad, - w_grad, - sequence, - context_length, - context_start, - total_pad, - begin_pad); + if (in_grad) { + ContextProjectionBackwardData( + out_grad, in_grad, sequence, context_length, context_start); + } + if (is_padding && w_grad) { + ContextProjectionBackwardWeight(out_grad, + w_grad, + sequence, + context_length, + context_start, + total_pad, + begin_pad); } } diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h index bb4f48364b9b454af7d37fe4d3c340666e53285c..baf78bc6c88d0d294f4457b81c52b22e425d9fdb 100644 --- a/paddle/function/ConvOp.h +++ b/paddle/function/ConvOp.h @@ -109,6 +109,13 @@ protected: return filter[filter.ndims() - 1]; } + // determine whether im2col needs to be performed + inline bool isNeedIm2col(const TensorShape& filter) const { + return !(getFilterHeight(filter) == 1 && getFilterWidth(filter) == 1 && + strideH() == 1 && strideW() == 1 && paddingH() == 0 && + paddingW() == 0); + } + std::vector strides_; std::vector paddings_; diff --git a/paddle/function/CosSimOpGpu.cu b/paddle/function/CosSimOpGpu.cu index c62ab39551f02288618244871ae31c6800df5b42..a1f88f479b5818e3864129a4dac723bceed76fcf 100644 --- a/paddle/function/CosSimOpGpu.cu +++ b/paddle/function/CosSimOpGpu.cu @@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "CosSimOp.h" #include "hl_base.h" #include "hl_device_functions.cuh" -#include "CosSimOp.h" namespace paddle { -template +template __global__ void KeCosSim(real* output, const real* input1, const real* input2, @@ -78,8 +78,8 @@ void hlCossim(real* output, dim3 threads(block_size, 1); dim3 grid(1, input1_height); - KeCosSim<<>> - (output, input1, input2, width, input1_height, input2_height, scale); + KeCosSim<<>>( + output, input1, input2, width, input1_height, input2_height, scale); CHECK_SYNC("hlCossim failed"); } @@ -99,7 +99,7 @@ void CosSimForward(GpuMatrix& out_mat, hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale); } -template +template __global__ void KeCosSimDerivative(const real* grad, const real* output, const real* prev_out_x, @@ -148,14 +148,13 @@ __global__ void KeCosSimDerivative(const real* grad, if (xy[0] == 0) { real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0])); for (int index = tid; index < width; index += block_size) { - prev_grad_x[index] += - scale * grad[ty] * prev_out_y[index] * reciprocal; + prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal; if (input2_height > 1) { - prev_grad_y[index] += - scale * grad[ty] * prev_out_x[index] * reciprocal; + prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal; } else { - paddle::paddleAtomicAdd(prev_grad_y + index, - scale * grad[ty] * prev_out_x[index] * reciprocal); + paddle::paddleAtomicAdd( + prev_grad_y + index, + scale * grad[ty] * prev_out_x[index] * reciprocal); } } } else { @@ -163,17 +162,18 @@ __global__ void KeCosSimDerivative(const real* grad, real reciprocalSquareSumX = 1.0 / xx[0]; real reciprocalSquareSumY = 1.0 / yy[0]; for (int index = tid; index < width; index += block_size) { - prev_grad_x[index] += output[ty] * grad[ty] * - (prev_out_y[index] * reciprocalXY - - prev_out_x[index] * reciprocalSquareSumX); + prev_grad_x[index] += + output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY - + prev_out_x[index] * reciprocalSquareSumX); if (input2_height > 1) { - prev_grad_y[index] += output[ty] * grad[ty] * - (prev_out_x[index] * reciprocalXY - - prev_out_y[index] * reciprocalSquareSumY); + prev_grad_y[index] += + output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY - + prev_out_y[index] * reciprocalSquareSumY); } else { - paddle::paddleAtomicAdd(prev_grad_y + index, output[ty] * grad[ty] * - (prev_out_x[index] * reciprocalXY - - prev_out_y[index] * reciprocalSquareSumY)); + paddle::paddleAtomicAdd( + prev_grad_y + index, + output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY - + prev_out_y[index] * reciprocalSquareSumY)); } } } @@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad, const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, input1_height); - KeCosSimDerivative<<>> - (grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y, width, - input1_height, input2_height, scale); + KeCosSimDerivative<<>>( + grad, + output, + prev_out_x, + prev_out_y, + prev_grad_x, + prev_grad_y, + width, + input1_height, + input2_height, + scale); CHECK_SYNC("hlCossimDerivate failed"); } @@ -214,9 +222,9 @@ void CosSimBackward(const GpuMatrix& out_grad, real scale) { CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() && in2_val.getData() && in1_grad.getData() && in2_grad.getData()); - CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ - && in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_) - << "Matrix types are not equally GPU"; + CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ && + in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_) + << "Matrix types are not equally GPU"; size_t dim = in1_val.getWidth(); const real* grad = out_grad.getData(); diff --git a/paddle/function/CropOpGpu.cu b/paddle/function/CropOpGpu.cu index 786eb268d45aadee0c1f6fcbbafc23173cf0bc77..241356a9ca0b673c86ff4c39594722211e2d224e 100644 --- a/paddle/function/CropOpGpu.cu +++ b/paddle/function/CropOpGpu.cu @@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "CropOp.h" +#include "hl_base.h" namespace paddle { -__global__ void KeCrop(real* outputs, const real* inputs, - int inC, int inH, int inW, - int cropC, int cropH, int cropW, - int outC, int outH, int outW, int nthreads) { +__global__ void KeCrop(real* outputs, + const real* inputs, + int inC, + int inH, + int inW, + int cropC, + int cropH, + int cropW, + int outC, + int outH, + int outW, + int nthreads) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < nthreads) { const int w = idx % outW; @@ -35,12 +43,12 @@ __global__ void KeCrop(real* outputs, const real* inputs, template <> void Crop(real* outputs, - const real* inputs, - const TensorShape inShape, - const TensorShape outShape, - const FuncConfig& conf) { + const real* inputs, + const TensorShape inShape, + const TensorShape outShape, + const FuncConfig& conf) { std::vector crop_corner = - conf.get>("crop_corner"); + conf.get>("crop_corner"); int cropC = crop_corner[1]; int cropH = crop_corner[2]; int cropW = crop_corner[3]; @@ -58,16 +66,33 @@ void Crop(real* outputs, int blockSize = 1024; int gridSize = (nth + blockSize - 1) / blockSize; - KeCrop<<>> - (outputs, inputs, inC, inH, inW, cropC, cropH, cropW, - outC, outH, outW, nth); + KeCrop<<>>(outputs, + inputs, + inC, + inH, + inW, + cropC, + cropH, + cropW, + outC, + outH, + outW, + nth); CHECK_SYNC("Crop"); } -__global__ void KeCropDiff(const real* inGrad, real* outGrad, - int inC, int inH, int inW, - int cropC, int cropH, int cropW, - int outC, int outH, int outW, int nthreads) { +__global__ void KeCropDiff(const real* inGrad, + real* outGrad, + int inC, + int inH, + int inW, + int cropC, + int cropH, + int cropW, + int outC, + int outH, + int outW, + int nthreads) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < nthreads) { const int w = idx % inW; @@ -84,12 +109,12 @@ __global__ void KeCropDiff(const real* inGrad, real* outGrad, template <> void CropGrad(const real* inGrad, - real* outGrad, - const TensorShape inShape, - const TensorShape outShape, - const FuncConfig& conf) { + real* outGrad, + const TensorShape inShape, + const TensorShape outShape, + const FuncConfig& conf) { std::vector crop_corner = - conf.get>("crop_corner"); + conf.get>("crop_corner"); int cropC = crop_corner[1]; int cropH = crop_corner[2]; int cropW = crop_corner[3]; @@ -107,9 +132,18 @@ void CropGrad(const real* inGrad, int blockSize = 1024; int gridSize = (nth + blockSize - 1) / blockSize; - KeCropDiff <<>> - (inGrad, outGrad, inC, inH, inW, cropC, cropH, cropW, - outC, outH, outW, nth); + KeCropDiff<<>>(inGrad, + outGrad, + inC, + inH, + inW, + cropC, + cropH, + cropW, + outC, + outH, + outW, + nth); CHECK_SYNC("CropGrad"); } diff --git a/paddle/function/CrossMapNormalOpGpu.cu b/paddle/function/CrossMapNormalOpGpu.cu index b33dd108348b7789c6e73bfe3b1ffbc448163ef7..88b991ff6a1f028b333e82e2801ed2e9251aa36d 100644 --- a/paddle/function/CrossMapNormalOpGpu.cu +++ b/paddle/function/CrossMapNormalOpGpu.cu @@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "CrossMapNormalOp.h" +#include "hl_base.h" namespace paddle { -__global__ void KeCMRNormFillScale(size_t imageSize, const real* in, - real* scale, size_t channels, - size_t height, size_t width, size_t size, +__global__ void KeCMRNormFillScale(size_t imageSize, + const real* in, + real* scale, + size_t channels, + size_t height, + size_t width, + size_t size, real alpha) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < imageSize) { @@ -51,8 +55,10 @@ __global__ void KeCMRNormFillScale(size_t imageSize, const real* in, } } -__global__ void KeCMRNormOutput(size_t inputSize, const real* in, - const real* scale, real negative_beta, +__global__ void KeCMRNormOutput(size_t inputSize, + const real* in, + const real* scale, + real negative_beta, real* out) { const int index = threadIdx.x + blockIdx.x * blockDim.x; if (index < inputSize) { @@ -74,24 +80,30 @@ void CrossMapNormal(real* outputs, size_t imageSize = numSamples * height * width; int blockSize = 1024; int gridSize = (imageSize + 1024 - 1) / 1024; - KeCMRNormFillScale<<>> - (imageSize, inputs, denoms, channels, height, width, size, scale); + KeCMRNormFillScale<<>>( + imageSize, inputs, denoms, channels, height, width, size, scale); - size_t inputSize = numSamples * height * width *channels; + size_t inputSize = numSamples * height * width * channels; blockSize = 1024; gridSize = (inputSize + 1024 - 1) / 1024; - KeCMRNormOutput<<>> - (inputSize, inputs, denoms, -pow, outputs); + KeCMRNormOutput<<>>( + inputSize, inputs, denoms, -pow, outputs); CHECK_SYNC("CrossMapNormal"); } -__global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data, - const real* top_data, const real* scale, - const real* top_diff, size_t channels, - size_t height, size_t width, size_t size, - real negative_beta, real cache_ratio, - real* bottom_diff ) { +__global__ void KeCMRNormDiff(size_t imageSize, + const real* bottom_data, + const real* top_data, + const real* scale, + const real* top_diff, + size_t channels, + size_t height, + size_t width, + size_t size, + real negative_beta, + real cache_ratio, + real* bottom_diff) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < imageSize) { const int w = idx % width; @@ -113,17 +125,17 @@ __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data, while (index < channels + post_pad) { if (index < channels) { accum += top_diff[index * step] * top_data[index * step] / - scale[index * step]; + scale[index * step]; } if (index >= size) { accum -= top_diff[(index - size) * step] * - top_data[(index - size) * step] / scale[(index - size) * step]; + top_data[(index - size) * step] / scale[(index - size) * step]; } if (index >= post_pad) { bottom_diff[(index - post_pad) * step] += - top_diff[(index - post_pad) * step] * - pow(scale[(index - post_pad) * step], negative_beta) - cache_ratio * - bottom_data[(index - post_pad) * step] * accum; + top_diff[(index - post_pad) * step] * + pow(scale[(index - post_pad) * step], negative_beta) - + cache_ratio * bottom_data[(index - post_pad) * step] * accum; } ++index; } @@ -147,9 +159,18 @@ void CrossMapNormalGrad(real* inputsGrad, int blockSize = 1024; int gridSize = (imageSize + 1024 - 1) / 1024; - KeCMRNormDiff <<>> - (imageSize, inputsValue, outputsValue, denoms, outputsGrad, channels, - height, width, size, -pow, 2.0f * pow * scale, inputsGrad); + KeCMRNormDiff<<>>(imageSize, + inputsValue, + outputsValue, + denoms, + outputsGrad, + channels, + height, + width, + size, + -pow, + 2.0f * pow * scale, + inputsGrad); CHECK_SYNC("CrossMapNormalGrad"); } diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/function/CrossMapNormalOpTest.cpp index ed17b17da616db9d52318f21c133458d698b0dd8..3b390db77f085aecfd65a9aa64e68ecc189163c1 100644 --- a/paddle/function/CrossMapNormalOpTest.cpp +++ b/paddle/function/CrossMapNormalOpTest.cpp @@ -18,11 +18,11 @@ limitations under the License. */ namespace paddle { TEST(CrossMapNormal, real) { - for (size_t numSamples : {5, 32}) { - for (size_t channels : {1, 5, 32}) { - for (size_t imgSizeH : {5, 33, 100}) { - for (size_t imgSizeW : {5, 32, 96}) { - for (size_t size : {1, 2, 3, 5, 7}) { + for (size_t numSamples : {5}) { + for (size_t channels : {1, 5}) { + for (size_t imgSizeH : {5, 33}) { + for (size_t imgSizeW : {5, 32}) { + for (size_t size : {1, 3}) { VLOG(3) << " numSamples=" << numSamples << " channels=" << channels << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW << " size=" << size; @@ -48,11 +48,11 @@ TEST(CrossMapNormal, real) { } TEST(CrossMapNormalGrad, real) { - for (size_t numSamples : {5, 32}) { - for (size_t channels : {1, 5, 32}) { - for (size_t imgSizeH : {5, 33, 100}) { - for (size_t imgSizeW : {5, 32, 96}) { - for (size_t size : {1, 2, 3, 5, 7}) { + for (size_t numSamples : {5}) { + for (size_t channels : {1, 5}) { + for (size_t imgSizeH : {5, 33}) { + for (size_t imgSizeW : {5, 32}) { + for (size_t size : {1, 3}) { VLOG(3) << " numSamples=" << numSamples << " channels=" << channels << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW << " size=" << size; diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu index ede0d27aa82e7d71ff5bc33df110fec260e06463..33463805cbd4746c05548028e0bc4a0e2a90453e 100644 --- a/paddle/function/DepthwiseConvOpGpu.cu +++ b/paddle/function/DepthwiseConvOpGpu.cu @@ -20,17 +20,25 @@ namespace paddle { // CUDA kernel to compute the depthwise convolution forward pass template -__global__ -void ConvolutionDepthwiseForward(const int nthreads, - const T* const inputData, const T* const filterData, - const int batchSize, const int outputChannels, const int outputHeight, - const int outputWidth, const int inputChannels, const int inputHeight, - const int inputWidth, const int filterMultiplier, const int filterHeight, - const int filterWidth, const int strideH, const int strideW, - const int paddingH, const int paddingW, T* const outputData) { - - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; +__global__ void ConvolutionDepthwiseForward(const int nthreads, + const T* const inputData, + const T* const filterData, + const int batchSize, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const outputData) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < nthreads) { const int batch = index / outputChannels / outputHeight / outputWidth; @@ -45,32 +53,36 @@ void ConvolutionDepthwiseForward(const int nthreads, const int w_in_start = -paddingW + w_out * strideW; const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1; const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1; - if ((h_in_start >= 0) && (h_in_end < inputHeight) - && (w_in_start >= 0) && (w_in_end < inputWidth)) { - for (int kh = 0; kh < filterHeight; ++kh) { - for (int kw = 0; kw < filterWidth; ++kw) { - const int h_in = -paddingH + h_out * strideH + kh; - const int w_in = -paddingW + w_out * strideW + kw; - const int offset = ((batch * inputChannels + c_in) - * inputHeight + h_in) * inputWidth + w_in; - value += (*weight) * inputData[offset]; - ++weight; - } + if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) && + (w_in_end < inputWidth)) { + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; + const int offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * + inputWidth + + w_in; + value += (*weight) * inputData[offset]; + ++weight; } + } } else { - for (int kh = 0; kh < filterHeight; ++kh) { - for (int kw = 0; kw < filterWidth; ++kw) { - const int h_in = -paddingH + h_out * strideH + kh; - const int w_in = -paddingW + w_out * strideW + kw; - if ((h_in >= 0) && (h_in < inputHeight) - && (w_in >= 0) && (w_in < inputWidth)) { - const int offset = ((batch * inputChannels + c_in) - * inputHeight + h_in) * inputWidth + w_in; - value += (*weight) * inputData[offset]; - } - ++weight; - } - } + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; + if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && + (w_in < inputWidth)) { + const int offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * + inputWidth + + w_in; + value += (*weight) * inputData[offset]; + } + ++weight; + } + } } outputData[index] = value; } @@ -78,16 +90,25 @@ void ConvolutionDepthwiseForward(const int nthreads, // CUDA kernel to compute the depthwise convolution backprop w.r.t input. template -__global__ -void ConvolutionDepthwiseInputBackward(const int nthreads, - const T* const top_diff, const T* const weight_data, - const int num, const int outputChannels, const int outputHeight, - const int outputWidth, const int inputChannels, const int inputHeight, - const int inputWidth, const int filterMultiplier, const int filterHeight, - const int filterWidth, const int strideH, const int strideW, - const int paddingH, const int paddingW, T* const bottom_diff) { - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; +__global__ void ConvolutionDepthwiseInputBackward(const int nthreads, + const T* const top_diff, + const T* const weight_data, + const int num, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const bottom_diff) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < nthreads) { const int batch = index / inputChannels / inputHeight / inputWidth; const int c_in = (index / inputHeight / inputWidth) % inputChannels; @@ -96,65 +117,80 @@ void ConvolutionDepthwiseInputBackward(const int nthreads, const int c_out_start = c_in * filterMultiplier; - int h_out_start = (h_in - filterHeight + paddingH + strideH)/strideH; + int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH; h_out_start = 0 > h_out_start ? 0 : h_out_start; - int h_out_end = (h_in + paddingH)/strideH; - h_out_end = outputHeight - 1 < h_out_end? outputHeight - 1 : h_out_end; - int w_out_start = (w_in - filterWidth + paddingW + strideW)/strideW; + int h_out_end = (h_in + paddingH) / strideH; + h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end; + int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW; w_out_start = 0 > w_out_start ? 0 : w_out_start; - int w_out_end = (w_in + paddingW)/strideW; - w_out_end = outputWidth - 1 < w_out_end? outputWidth - 1 : w_out_end; + int w_out_end = (w_in + paddingW) / strideW; + w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end; T value = 0; - for (int c_out = c_out_start; - c_out < c_out_start + filterMultiplier; c_out ++) { - for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { - const int filter_h = h_in + paddingH - h_out * strideH; - for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { - const int filter_w = w_in + paddingW - w_out * strideW; - const int filter_offset = c_out * filterHeight * filterWidth - + filter_h * filterWidth + filter_w; - const int top_diff_offset = ((batch * outputChannels + c_out) * - outputHeight + h_out)* outputWidth + w_out; - value += top_diff[top_diff_offset] * weight_data[filter_offset]; - } + for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier; + c_out++) { + for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { + const int filter_h = h_in + paddingH - h_out * strideH; + for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { + const int filter_w = w_in + paddingW - w_out * strideW; + const int filter_offset = c_out * filterHeight * filterWidth + + filter_h * filterWidth + filter_w; + const int top_diff_offset = + ((batch * outputChannels + c_out) * outputHeight + h_out) * + outputWidth + + w_out; + value += top_diff[top_diff_offset] * weight_data[filter_offset]; } + } } bottom_diff[index] += value; - } + } } // CUDA kernel to compute the depthwise convolution backprop w.r.t filter. template -__global__ -void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, - const T* const top_diff, const T* const inputData, - const int num, const int outputChannels, const int outputHeight, - const int outputWidth, const int inputChannels, const int inputHeight, - const int inputWidth, const int filterMultiplier, const int filterHeight, - const int filterWidth, const int strideH, const int strideW, - const int paddingH, const int paddingW, T* const buffer_data) { - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; +__global__ void ConvolutionDepthwiseFilterBackward(const int num_i, + const int nthreads, + const T* const top_diff, + const T* const inputData, + const int num, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const buffer_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < nthreads) { const int h_out = (index / outputWidth) % outputHeight; const int w_out = index % outputWidth; - const int kh = (index / filterWidth / outputHeight / outputWidth) - % filterHeight; + const int kh = + (index / filterWidth / outputHeight / outputWidth) % filterHeight; const int kw = (index / outputHeight / outputWidth) % filterWidth; const int h_in = -paddingH + h_out * strideH + kh; const int w_in = -paddingW + w_out * strideW + kw; - if ((h_in >= 0) && (h_in < inputHeight) - && (w_in >= 0) && (w_in < inputWidth)) { - const int c_out = index / - (filterHeight * filterWidth * outputHeight * outputWidth); + if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && + (w_in < inputWidth)) { + const int c_out = + index / (filterHeight * filterWidth * outputHeight * outputWidth); const int c_in = c_out / filterMultiplier; const int batch = num_i; - const int top_offset = ((batch * outputChannels + c_out) * - outputHeight + h_out) * outputWidth + w_out; - const int bottom_offset = ((batch * inputChannels + c_in) - * inputHeight + h_in) * inputWidth + w_in; + const int top_offset = + ((batch * outputChannels + c_out) * outputHeight + h_out) * + outputWidth + + w_out; + const int bottom_offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth + + w_in; buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset]; } else { buffer_data[index] = 0; @@ -163,170 +199,169 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, } template -class DepthwiseConvFunctor{ +class DepthwiseConvFunctor { public: void operator()(const T* inputData, - const T* filterData, - int batchSize, - int outputChannels, - int outputHeight, - int outputWidth, - int inputChannels, - int inputHeight, - int inputWidth, - int filterMultiplier, - int filterHeight, - int filterWidth, - int strideH, - int strideW, - int paddingH, - int paddingW, - T* outputData){ + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* outputData) { int outputSize = batchSize * outputChannels * outputHeight * outputWidth; - size_t blocks = (outputSize + 1024 -1) / 1024; + size_t blocks = (outputSize + 1024 - 1) / 1024; size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; + size_t blockY = (blocks + 512 - 1) / 512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); - ConvolutionDepthwiseForward - <<< grid, threads, 0, STREAM_DEFAULT >>>( - outputSize, - inputData, - filterData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputChannels, - inputHeight, - inputWidth, - filterMultiplier, - filterHeight, - filterWidth, - strideH, - strideW, - paddingH, - paddingW, - outputData); - } + ConvolutionDepthwiseForward<<>>( + outputSize, + inputData, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + outputData); + } }; template -class DepthwiseConvGradInputFunctor{ +class DepthwiseConvGradInputFunctor { public: void operator()(const T* outputGrad, - const T* filterData, - int batchSize, - int outputChannels, - int outputHeight, - int outputWidth, - int inputChannels, - int inputHeight, - int inputWidth, - int filterMultiplier, - int filterHeight, - int filterWidth, - int strideH, - int strideW, - int paddingH, - int paddingW, - T* inputGrad){ + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* inputGrad) { int inputSize = batchSize * inputChannels * inputHeight * inputWidth; - size_t blocks = (inputSize + 1024 -1) / 1024; + size_t blocks = (inputSize + 1024 - 1) / 1024; size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; + size_t blockY = (blocks + 512 - 1) / 512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); - ConvolutionDepthwiseInputBackward - // NOLINT_NEXT_LINE(whitespace/operators) - <<< grid, threads, 0, STREAM_DEFAULT >>>( - inputSize, - outputGrad, - filterData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputChannels, - inputHeight, - inputWidth, - filterMultiplier, - filterHeight, - filterWidth, - strideH, - strideW, - paddingH, - paddingW, - inputGrad); - } + // NOLINT_NEXT_LINE(whitespace/operators) + <<>>(inputSize, + outputGrad, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + inputGrad); + } }; template class DepthwiseConvGradFilterFunctor { public: void operator()(const T* outputGrad, - const T* inputData, - int batchSize, - int outputChannels, - int outputHeight, - int outputWidth, - int inputChannels, - int inputHeight, - int inputWidth, - int filterMultiplier, - int filterHeight, - int filterWidth, - int strideH, - int strideW, - int paddingH, - int paddingW, - T* colData, - T* filterGrad){ - int colDataSize = outputChannels * filterHeight * filterWidth - * outputHeight * outputWidth; + const T* inputData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* colData, + T* filterGrad) { + int colDataSize = outputChannels * filterHeight * filterWidth * + outputHeight * outputWidth; - size_t blocks = (colDataSize + 1024 -1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, - 1, filterGrad, false, true); + size_t blocks = (colDataSize + 1024 - 1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, + 1, + filterGrad, + false, + true); - for (int i = 0; i < batchSize; i++) { - ConvolutionDepthwiseFilterBackward - <<< grid, threads, 0, STREAM_DEFAULT >>>( - i, - colDataSize, - outputGrad, - inputData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputChannels, - inputHeight, - inputWidth, - filterMultiplier, - filterHeight, - filterWidth, - strideH, - strideW, - paddingH, - paddingW, - colData); - int K = outputHeight * outputWidth; - int M = colDataSize / K; + for (int i = 0; i < batchSize; i++) { + ConvolutionDepthwiseFilterBackward< + T><<>>(i, + colDataSize, + outputGrad, + inputData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + colData); + int K = outputHeight * outputWidth; + int M = colDataSize / K; - BaseMatrix colMatrix(M, K, colData, false, true); - filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); - } + BaseMatrix colMatrix(M, K, colData, false, true); + filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); } + } }; #ifdef PADDLE_TYPE_DOUBLE diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp index fdf7e631e5ab8c67eb5cf906bd0af49740d60112..6360a6e023ebd2f97c442c80c8d7f56b5ec4cbf7 100644 --- a/paddle/function/FunctionTest.cpp +++ b/paddle/function/FunctionTest.cpp @@ -24,14 +24,14 @@ void FunctionApi(typename Tensor::Matrix& output, template <> void FunctionApi(CpuMatrix& output, const CpuMatrix& input) { - EXPECT_EQ(output.getHeight(), 100); - EXPECT_EQ(output.getWidth(), 200); + EXPECT_EQ(output.getHeight(), 100U); + EXPECT_EQ(output.getWidth(), 200U); } template <> void FunctionApi(GpuMatrix& output, const GpuMatrix& input) { - EXPECT_EQ(output.getHeight(), 10); - EXPECT_EQ(output.getWidth(), 20); + EXPECT_EQ(output.getHeight(), 10U); + EXPECT_EQ(output.getWidth(), 20U); } template @@ -85,14 +85,14 @@ void testBufferArgs(const BufferArgs& inputs, } void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) { - EXPECT_EQ(inputs.size(), 1); + EXPECT_EQ(inputs.size(), 1U); check(inputs[0]); } TEST(Arguments, Matrix) { MatrixPtr matrix = Matrix::create(100, 200); CheckBufferArg check = [=](const BufferArg& arg) { - EXPECT_EQ(arg.shape().ndims(), 2); + EXPECT_EQ(arg.shape().ndims(), 2U); EXPECT_EQ(arg.shape()[0], 100); EXPECT_EQ(arg.shape()[1], 200); EXPECT_EQ(arg.data(), matrix->getData()); diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 9deb2739fcfff935a98a0b5b31b5d11819d81227..0ada4d70a0c7d13f9b5fb1a42eac07fc4c775a87 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -66,16 +66,23 @@ public: real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); real* outputData = outputs[0].data(); + bool needIm2col = isNeedIm2col(filter); + TensorShape imShape = TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - TensorShape colShape = TensorShape({inputChannels / groups_, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - resizeBuffer(colShape.getElements()); - real* colData = reinterpret_cast(memory_->getBuf()); + TensorShape colShape; + real* colData = NULL; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + resizeBuffer(colShape.getElements()); + colData = reinterpret_cast(memory_->getBuf()); + } Im2ColFunctor im2col; GemmFunctor gemm; @@ -86,15 +93,18 @@ public: for (size_t i = 0; i < batchSize; i++) { for (size_t g = 0; g < groups_; g++) { - im2col(inputData + g * inputOffset, - imShape, - colData, - colShape, - strideH(), - strideW(), - paddingH(), - paddingW()); - + if (needIm2col) { + im2col(inputData + g * inputOffset, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW()); + } else { + colData = inputData + g * inputOffset; + } int M = outputChannels / groups_; int N = outputHeight * outputWidth; int K = inputChannels / groups_ * filterHeight * filterWidth; @@ -159,19 +169,27 @@ public: real* outputGrad = inputs[0].data(); real* filterData = inputs[1].data(); real* inputGrad = outputs[0].data(); + bool needIm2col = isNeedIm2col(filter); + TensorShape imShape = TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - TensorShape colShape = TensorShape({inputChannels / groups_, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - resizeBuffer(colShape.getElements()); - real* colData = reinterpret_cast(memory_->getBuf()); + TensorShape colShape; + real* colData = NULL; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + resizeBuffer(colShape.getElements()); + colData = reinterpret_cast(memory_->getBuf()); + } Col2ImFunctor col2im; GemmFunctor gemm; + size_t inputOffset = imShape.getElements(); size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth; @@ -182,6 +200,11 @@ public: int K = outputChannels / groups_; int N = outputHeight * outputWidth; int M = inputChannels / groups_ * filterHeight * filterWidth; + real scale = 0.0f; + if (!needIm2col) { + colData = inputGrad + g * inputOffset; + scale = 1.0f; + } gemm(CblasTrans, CblasNoTrans, M, @@ -192,17 +215,19 @@ public: M, outputGrad + g * outputOffset, N, - 0.0f, + scale, colData, N); - col2im(inputGrad + g * inputOffset, - imShape, - colData, - colShape, - strideH(), - strideW(), - paddingH(), - paddingW()); + if (needIm2col) { + col2im(inputGrad + g * inputOffset, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW()); + } } inputGrad += inputChannels * inputHeight * inputWidth; outputGrad += outputChannels * outputHeight * outputWidth; @@ -255,16 +280,23 @@ public: real* outputGrad = inputs[0].data(); real* inputData = inputs[1].data(); real* filterGrad = outputs[0].data(); + bool needIm2col = isNeedIm2col(filter); + TensorShape imShape = TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - TensorShape colShape = TensorShape({inputChannels / groups_, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - resizeBuffer(colShape.getElements()); - real* colData = reinterpret_cast(memory_->getBuf()); + TensorShape colShape; + real* colData = NULL; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + resizeBuffer(colShape.getElements()); + colData = reinterpret_cast(memory_->getBuf()); + } Im2ColFunctor im2col; GemmFunctor gemm; @@ -274,15 +306,18 @@ public: size_t filterOffset = filter.getElements() / groups_; for (size_t i = 0; i < batchSize; i++) { for (size_t g = 0; g < groups_; g++) { - im2col(inputData + g * inputOffset, - imShape, - colData, - colShape, - strideH(), - strideW(), - paddingH(), - paddingW()); - + if (needIm2col) { + im2col(inputData + g * inputOffset, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW()); + } else { + colData = inputData + g * inputOffset; + } int M = outputChannels / groups_; int K = outputHeight * outputWidth; int N = inputChannels / groups_ * filterHeight * filterWidth; diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu index 15ba854009636d027447d104071163100d5e3f4b..bd98610498b1af003574129118be4684d38e5813 100644 --- a/paddle/function/Im2ColOpGpu.cu +++ b/paddle/function/Im2ColOpGpu.cu @@ -17,16 +17,21 @@ limitations under the License. */ namespace paddle { -template -__global__ -void im2col(const T* data_im, int numOuts, int height, int width, - int blockH, int blockW, - int strideH, int strideW, - int paddingH, int paddingW, - int height_col, int width_col, - T* data_col) { - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; +template +__global__ void im2col(const T* data_im, + int numOuts, + int height, + int width, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int height_col, + int width_col, + T* data_col) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < numOuts) { int w_out = index % width_col; index /= width_col; @@ -39,17 +44,17 @@ void im2col(const T* data_im, int numOuts, int height, int width, data_col += (channel_out * height_col + h_out) * width_col + w_out; for (int i = 0; i < blockH; ++i) { for (int j = 0; j < blockW; ++j) { - int rIdx = int(h_in+i); - int cIdx = int(w_in+j); - if ((rIdx-(int)paddingH) >= (int)height || - (rIdx-(int)paddingH) < 0 || - (cIdx-(int)paddingW) >= (int)width || - (cIdx-(int)paddingW) < 0) { + int rIdx = int(h_in + i); + int cIdx = int(w_in + j); + if ((rIdx - (int)paddingH) >= (int)height || + (rIdx - (int)paddingH) < 0 || + (cIdx - (int)paddingW) >= (int)width || + (cIdx - (int)paddingW) < 0) { *data_col = 0; } else { - rIdx = rIdx + channel_in*height - paddingH; + rIdx = rIdx + channel_in * height - paddingH; cIdx = cIdx - paddingW; - *data_col = data_im[rIdx* width + cIdx]; + *data_col = data_im[rIdx * width + cIdx]; } data_col += height_col * width_col; } @@ -82,60 +87,73 @@ public: int outputWidth = colShape[4]; int numKernels = inputChannels * outputHeight * outputWidth; - int blocks = (numKernels + 1024 -1) / 1024; + int blocks = (numKernels + 1024 - 1) / 1024; int blockX = 512; int blockY = (blocks + 512 - 1) / 512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); - im2col<<< grid, threads, 0, STREAM_DEFAULT >>> - (imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth, - strideHeight, strideWidth, paddingHeight, paddingWidth, - outputHeight, outputWidth, colData); + im2col<<>>(imData, + numKernels, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth, + colData); CHECK_SYNC("Im2ColFunctor GPU failed"); } }; -template -__global__ -void col2im(size_t n, const T* data_col, size_t height, - size_t width, size_t channels, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t height_col, size_t width_col, - T* data_im) { +template +__global__ void col2im(size_t n, + const T* data_col, + size_t height, + size_t width, + size_t channels, + size_t blockH, + size_t blockW, + size_t strideH, + size_t strideW, + size_t paddingH, + size_t paddingW, + size_t height_col, + size_t width_col, + T* data_im) { size_t index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < n) { T val = 0; int w = int(index % width); int h = int((index / width) % height); int c = int(index / (width * height)); if ((w - (int)paddingW) >= 0 && - (w - (int)paddingW) < (width-2 * paddingW) && - (h - (int)paddingH) >= 0 && - (h - paddingH) < (height - 2 * paddingH)) { + (w - (int)paddingW) < (width - 2 * paddingW) && + (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) { // compute the start and end of the output int w_col_start = - (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; - int w_col_end = - min((int)(w / (int)strideW + 1), (int)(width_col)); + (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; + int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col)); int h_col_start = - (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1; + (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1; int h_col_end = min(int(h / strideH + 1), int(height_col)); for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { // the col location: [c * width * height + h_out, w_out] - int c_col = int(c * blockH* blockW) + \ - (h - h_col * (int)strideH) * (int)blockW + - (w - w_col * (int)strideW); + int c_col = int(c * blockH * blockW) + + (h - h_col * (int)strideH) * (int)blockW + + (w - w_col * (int)strideW); val += data_col[(c_col * height_col + h_col) * width_col + w_col]; } } h -= paddingH; w -= paddingW; - data_im[c*((width-2*paddingW) * (height-2*paddingH)) + - h*(width-2*paddingW) + w] += val; + data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) + + h * (width - 2 * paddingW) + w] += val; } } } @@ -164,32 +182,32 @@ public: int outputHeight = colShape[3]; int outputWidth = colShape[4]; - size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight) - * (inputWidth + 2*paddingWidth); + size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) * + (inputWidth + 2 * paddingWidth); - size_t blocks = (numKernels + 1024 -1) / 1024; + size_t blocks = (numKernels + 1024 - 1) / 1024; size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; + size_t blockY = (blocks + 512 - 1) / 512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); // To avoid involving atomic operations, we will launch one kernel per // bottom dimension, and then in the kernel add up the top dimensions. - col2im<<< grid, threads, 0, STREAM_DEFAULT >>> - (numKernels, - colData, - inputHeight + 2*paddingHeight, - inputWidth + 2*paddingWidth, - inputChannels, - filterHeight, - filterWidth, - strideHeight, - strideWidth, - paddingHeight, - paddingWidth, - outputHeight, - outputWidth, - imData); + col2im<<>>( + numKernels, + colData, + inputHeight + 2 * paddingHeight, + inputWidth + 2 * paddingWidth, + inputChannels, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth, + imData); CHECK_SYNC("Col2ImFunctor GPU failed"); } }; @@ -199,31 +217,35 @@ template class Im2ColFunctor; template class Col2ImFunctor; template class Col2ImFunctor; -template -__global__ -void im2colOCF(const T* imData, T* colData, - int inputChannels, - int inputHeight, int inputWidth, - int filterHeight, int filterWidth, - int strideHeight, int strideWidth, - int paddingHeight, int paddingWidth, - int outputHeight, int outputWidth) { +template +__global__ void im2colOCF(const T* imData, + T* colData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth) { int swId = blockIdx.x; int shId = blockIdx.y; - for (int channelId = threadIdx.z; - channelId < inputChannels; + for (int channelId = threadIdx.z; channelId < inputChannels; channelId += blockDim.z) { for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { int widthOffset = idx + swId * strideWidth - paddingWidth; int heightOffset = idy + shId * strideHeight - paddingHeight; - int imOffset = widthOffset + heightOffset * inputWidth - + channelId * inputHeight * inputWidth; + int imOffset = widthOffset + heightOffset * inputWidth + + channelId * inputHeight * inputWidth; - int colOffset = idx + idy * filterWidth - + channelId * filterHeight * filterWidth - + (shId * outputWidth + swId) - * (inputChannels * filterHeight * filterWidth); + int colOffset = idx + idy * filterWidth + + channelId * filterHeight * filterWidth + + (shId * outputWidth + swId) * + (inputChannels * filterHeight * filterWidth); if (heightOffset >= inputHeight || heightOffset < 0 || widthOffset >= inputWidth || widthOffset < 0) { @@ -279,39 +301,52 @@ public: int blockDimZ = 1024 / blockDimX / blockDimY; dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels)); dim3 grid(outputWidth, outputHeight); - im2colOCF<<< grid, threads, 0, STREAM_DEFAULT >>> - (imData, colData, inputChannels, inputHeight, inputWidth, - filterHeight, filterWidth, strideHeight, strideWidth, - paddingHeight, paddingWidth, outputHeight, outputWidth); + im2colOCF<<>>(imData, + colData, + inputChannels, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth); CHECK_SYNC("Im2ColFunctor GPU failed"); } }; -template -__global__ -void col2imOCF(T* imData, const T* colData, - int inputChannels, - int inputHeight, int inputWidth, - int filterHeight, int filterWidth, - int strideHeight, int strideWidth, - int paddingHeight, int paddingWidth, - int outputHeight, int outputWidth) { +template +__global__ void col2imOCF(T* imData, + const T* colData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth) { int swId = blockIdx.x; int shId = blockIdx.y; - for (int channelId = threadIdx.z; - channelId < inputChannels; + for (int channelId = threadIdx.z; channelId < inputChannels; channelId += blockDim.z) { for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { int widthOffset = idx + swId * strideWidth - paddingWidth; int heightOffset = idy + shId * strideHeight - paddingHeight; - int imOffset = widthOffset + heightOffset * inputWidth - + channelId * inputHeight * inputWidth; + int imOffset = widthOffset + heightOffset * inputWidth + + channelId * inputHeight * inputWidth; - int colOffset = idx + idy * filterWidth - + channelId * filterHeight * filterWidth - + (shId * outputWidth + swId) - * (inputChannels * filterHeight * filterWidth); + int colOffset = idx + idy * filterWidth + + channelId * filterHeight * filterWidth + + (shId * outputWidth + swId) * + (inputChannels * filterHeight * filterWidth); if (heightOffset >= 0 && heightOffset < inputHeight && widthOffset >= 0 && widthOffset < inputWidth) { @@ -365,10 +400,19 @@ public: int blockDimZ = 1024 / blockDimX / blockDimY; dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels)); dim3 grid(outputWidth, outputHeight); - col2imOCF<<< grid, threads, 0, STREAM_DEFAULT >>> - (imData, colData, inputChannels, inputHeight, inputWidth, - filterHeight, filterWidth, strideHeight, strideWidth, - paddingHeight, paddingWidth, outputHeight, outputWidth); + col2imOCF<<>>(imData, + colData, + inputChannels, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth); CHECK_SYNC("Col2ImFunctor GPU failed"); } }; diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu index dcfcb2325d7dae22e0e0e78fc0bddf061fc0940c..9449b89056b4b1740cb4c3de630348b1b361d61e 100644 --- a/paddle/function/MulOpGpu.cu +++ b/paddle/function/MulOpGpu.cu @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "MulOp.h" +#include "hl_base.h" #include "paddle/math/Matrix.h" #include "paddle/math/SparseMatrix.h" diff --git a/paddle/function/PadOpGpu.cu b/paddle/function/PadOpGpu.cu index 9094f1528433fdcaad3397a991aa8ac6fa04bc01..5b6f4e6832aea4bcfe22e530f5f25ef5815729f1 100644 --- a/paddle/function/PadOpGpu.cu +++ b/paddle/function/PadOpGpu.cu @@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "PadOp.h" +#include "hl_base.h" namespace paddle { -__global__ void KePad(real* outputs, const real* inputs, - int inC, int inH, int inW, - int padc, int padh, int padw, - int outC, int outH, int outW, int nthreads) { +__global__ void KePad(real* outputs, + const real* inputs, + int inC, + int inH, + int inW, + int padc, + int padh, + int padw, + int outC, + int outH, + int outW, + int nthreads) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < nthreads) { const int w = idx % inW; @@ -50,16 +58,33 @@ void Pad(real* outputs, int outC = inC + cstart + cend; int outH = inH + hstart + hend; int outW = inW + wstart + wend; - KePad<<>> - (outputs, inputs, inC, inH, inW, cstart, hstart, wstart, - outC, outH, outW, nth); + KePad<<>>(outputs, + inputs, + inC, + inH, + inW, + cstart, + hstart, + wstart, + outC, + outH, + outW, + nth); CHECK_SYNC("Pad"); } -__global__ void KePadDiff(real* inGrad, const real* outGrad, - int inC, int inH, int inW, - int padc, int padh, int padw, - int outC, int outH, int outW, int nthreads) { +__global__ void KePadDiff(real* inGrad, + const real* outGrad, + int inC, + int inH, + int inW, + int padc, + int padh, + int padw, + int outC, + int outH, + int outW, + int nthreads) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < nthreads) { const int w = idx % inW; @@ -89,9 +114,18 @@ void PadGrad(real* inGrad, int outC = inC + cstart + cend; int outH = inH + hstart + hend; int outW = inW + wstart + wend; - KePadDiff <<>> - (inGrad, outGrad, inC, inH, inW, cstart, hstart, wstart, - outC, outH, outW, nth); + KePadDiff<<>>(inGrad, + outGrad, + inC, + inH, + inW, + cstart, + hstart, + wstart, + outC, + outH, + outW, + nth); CHECK_SYNC("PadGrad"); } diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu index d9dcc7d59d1e3c222f5a7ce448daa8d7edb6c978..b0cbd9fd1df9a35d6cc1cb5312099d8b45197944 100644 --- a/paddle/function/RowConvOpGpu.cu +++ b/paddle/function/RowConvOpGpu.cu @@ -12,16 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "RowConvOp.h" +#include "hl_base.h" namespace paddle { -template -__global__ void KeRowConv(real* y, const real* x, const real* w, - const int* starts, const int height, const int width, - const int numSeq, const int context) { - +template +__global__ void KeRowConv(real* y, + const real* x, + const real* w, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -30,7 +34,7 @@ __global__ void KeRowConv(real* y, const real* x, const real* w, __shared__ real sw[BLOCK_H][BLOCK_W]; for (int i = tidy; i < context; i += blky) { - sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; + sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0; } __syncthreads(); @@ -56,9 +60,14 @@ __global__ void KeRowConv(real* y, const real* x, const real* w, } } -__global__ void KeRowConv2(real* y, const real* x, const real* w, - const int* starts, const int height, const int width, - const int numSeq, const int context) { +__global__ void KeRowConv2(real* y, + const real* x, + const real* w, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -84,8 +93,6 @@ __global__ void KeRowConv2(real* y, const real* x, const real* w, } } - - template <> void RowConv(GpuMatrix& out, const GpuMatrix& in, @@ -105,21 +112,24 @@ void RowConv(GpuMatrix& out, dim3 dimGrid(DIVUP(width, dimBlock.x), 1); if (contextLength <= 32) { - KeRowConv<32, 32><<>> - (y, x, w, starts, height, width, numSeq, contextLength); + KeRowConv<32, 32><<>>( + y, x, w, starts, height, width, numSeq, contextLength); } else { - KeRowConv2<<>> - (y, x, w, starts, height, width, numSeq, contextLength); + KeRowConv2<<>>( + y, x, w, starts, height, width, numSeq, contextLength); } CHECK_SYNC("RowConv"); } - -template -__global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +template +__global__ void KeRowConvBwWeight(real* dw, + const real* x, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -138,21 +148,21 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, const int start = starts[i]; const int end = starts[i + 1]; const int steps = end - start; - const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H; + const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; for (int j = tidy; j < size; j += BLOCK_H) { int xoff = gidx + tidx; int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? - x[yoff * width + xoff] : 0.0; - sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? - dy[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = + (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy + context - 1] = + (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0; __syncthreads(); if (tidy < (context - 1)) { yoff = yoff - context + 1; - sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? - dy[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy] = + (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0; } __syncthreads(); @@ -179,11 +189,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, } } -template -__global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +template +__global__ void KeRowConvBwWeight2(real* dw, + const real* x, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int gidx = blockIdx.x * blockDim.x; @@ -196,19 +210,21 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, const int end = starts[i + 1]; const int steps = end - start; - const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H; + const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; for (int j = tidy; j < size; j += BLOCK_H) { int xoff = gidx + tidx; int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? - x[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = + (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; __syncthreads(); for (int t = 0; t < context; t++) { - sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && - yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0; + sh_dy[tidx][tidy] = + (xoff < width && (yoff - t) >= start && yoff - t < end) + ? dy[(yoff - t) * width + xoff] + : 0.0; __syncthreads(); real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx]; @@ -222,18 +238,22 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, __syncthreads(); if (tidx == 0 && (gidx + tidy) < width) { - dw[t*width + gidx + tidy] += val; + dw[t * width + gidx + tidy] += val; } } } } } -template -__global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +template +__global__ void KeRowConvBwData(real* dx, + const real* w, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -242,7 +262,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, __shared__ real sw[BLOCK_H][BLOCK_W]; for (int i = tidy; i < context; i += blky) { - sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; + sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0; } __syncthreads(); @@ -266,10 +286,14 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, } } -__global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +__global__ void KeRowConvBwData2(real* dx, + const real* w, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -295,14 +319,13 @@ __global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy, } } - template <> void RowConvGrad(const GpuMatrix& outG, - const GpuMatrix& in, - const GpuMatrix& filter, - GpuMatrix& inG, - GpuMatrix& filterG, - const GpuIVector& seq) { + const GpuMatrix& in, + const GpuMatrix& filter, + GpuMatrix& inG, + GpuMatrix& filterG, + const GpuIVector& seq) { const size_t numSeq = seq.getSize() - 1; const size_t contextLength = filter.getHeight(); const size_t height = in.getHeight(); @@ -318,13 +341,11 @@ void RowConvGrad(const GpuMatrix& outG, dim3 dimGrid(DIVUP(width, dimBlock.x), 1); real* dw = filterG.getData(); if (contextLength <= 32) { - KeRowConvBwWeight<32, 32, 32> - <<>> - (dw, x, dy, starts, height, width, numSeq, contextLength); + KeRowConvBwWeight<32, 32, 32><<>>( + dw, x, dy, starts, height, width, numSeq, contextLength); } else { - KeRowConvBwWeight2<32, 32> - <<>> - (dw, x, dy, starts, height, width, numSeq, contextLength); + KeRowConvBwWeight2<32, 32><<>>( + dw, x, dy, starts, height, width, numSeq, contextLength); } } @@ -333,13 +354,11 @@ void RowConvGrad(const GpuMatrix& outG, dim3 dimBlock2(32, 32); dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1); if (contextLength <= 64) { - KeRowConvBwData<32, 64> - <<>> - (dx, w, dy, starts, height, width, numSeq, contextLength); + KeRowConvBwData<32, 64><<>>( + dx, w, dy, starts, height, width, numSeq, contextLength); } else { - KeRowConvBwData2 - <<>> - (dx, w, dy, starts, height, width, numSeq, contextLength); + KeRowConvBwData2<<>>( + dx, w, dy, starts, height, width, numSeq, contextLength); } } diff --git a/paddle/function/TensorShapeTest.cpp b/paddle/function/TensorShapeTest.cpp index 45a2e106e7fc3f0e9e57cf8c2bb549d747f4f49b..e5c698237706e7210d3045bbfd0088af58db2954 100644 --- a/paddle/function/TensorShapeTest.cpp +++ b/paddle/function/TensorShapeTest.cpp @@ -19,35 +19,35 @@ namespace paddle { TEST(TensorShape, Constructor) { TensorShape t1; - EXPECT_EQ(t1.ndims(), 0); - EXPECT_EQ(t1.getElements(), 0); + EXPECT_EQ(t1.ndims(), 0U); + EXPECT_EQ(t1.getElements(), 0U); TensorShape t2(3); - EXPECT_EQ(t2.ndims(), 3); - EXPECT_EQ(t2.getElements(), 1); + EXPECT_EQ(t2.ndims(), 3U); + EXPECT_EQ(t2.getElements(), 1U); TensorShape t3({8, 10}); - EXPECT_EQ(t3.ndims(), 2); - EXPECT_EQ(t3.getElements(), 80); + EXPECT_EQ(t3.ndims(), 2U); + EXPECT_EQ(t3.getElements(), 80U); TensorShape t4(t3); EXPECT_EQ(t4.ndims(), t3.ndims()); EXPECT_EQ(t4.getElements(), t3.getElements()); TensorShape t5({1, 2, 3, 4, 5}); - EXPECT_EQ(t5.ndims(), 5); - EXPECT_EQ(t5.getElements(), 120); + EXPECT_EQ(t5.ndims(), 5U); + EXPECT_EQ(t5.getElements(), 120U); } TEST(TensorShape, GetAndSet) { TensorShape t({1, 2, 3}); - EXPECT_EQ(t.ndims(), 3); - EXPECT_EQ(t.getElements(), 6); + EXPECT_EQ(t.ndims(), 3U); + EXPECT_EQ(t.getElements(), 6U); EXPECT_EQ(t[1], 2); t.setDim(1, 100); - EXPECT_EQ(t.getElements(), 300); - EXPECT_EQ(t[1], 100); + EXPECT_EQ(t.getElements(), 300U); + EXPECT_EQ(t[1], 100U); } } // namespace paddle diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/function/TensorTypeTest.cpp index e50e46f3e99111731d9587f3e4ddfd4b26ae27e9..d1c559a91e294853fa6e19f9115bc008ae56915c 100644 --- a/paddle/function/TensorTypeTest.cpp +++ b/paddle/function/TensorTypeTest.cpp @@ -19,9 +19,9 @@ namespace paddle { TEST(TensorType, Matrix) { Tensor::Matrix matrix(100, 200); - EXPECT_EQ(matrix.getHeight(), 100); - EXPECT_EQ(matrix.getWidth(), 200); - EXPECT_EQ(matrix.getElementCnt(), 100 * 200); + EXPECT_EQ(matrix.getHeight(), 100U); + EXPECT_EQ(matrix.getWidth(), 200U); + EXPECT_EQ(matrix.getElementCnt(), 100U * 200U); EXPECT_EQ(matrix.useGpu(), false); Tensor::Matrix testGpu(100, 200); @@ -33,15 +33,15 @@ TEST(TensorType, Vector) { Tensor::Vector gpuVector(100); EXPECT_EQ(cpuVector.useGpu(), false); EXPECT_EQ(gpuVector.useGpu(), true); - EXPECT_EQ(cpuVector.getSize(), 100); - EXPECT_EQ(gpuVector.getSize(), 100); + EXPECT_EQ(cpuVector.getSize(), 100U); + EXPECT_EQ(gpuVector.getSize(), 100U); Tensor::Vector cpuIVector(100); Tensor::Vector gpuIVector(100); EXPECT_EQ(cpuIVector.useGpu(), false); EXPECT_EQ(gpuIVector.useGpu(), true); - EXPECT_EQ(cpuIVector.getSize(), 100); - EXPECT_EQ(gpuIVector.getSize(), 100); + EXPECT_EQ(cpuIVector.getSize(), 100U); + EXPECT_EQ(gpuIVector.getSize(), 100U); } TEST(TensorType, EmptyMatrix) { diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp index 81cc3c890b6d4ad048e4edc03208c85778244078..5de2170877ed6f6c70c5617918ad2c4e3b3ed2ee 100644 --- a/paddle/gserver/activations/ActivationFunction.cpp +++ b/paddle/gserver/activations/ActivationFunction.cpp @@ -186,7 +186,10 @@ Error __must_check forward(Argument& act) { useGpu(act.deviceId)); } - auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId)); + auto starts = + act.hasSubseq() + ? act.subSequenceStartPositions->getVector(useGpu(act.deviceId)) + : act.sequenceStartPositions->getVector(useGpu(act.deviceId)); act.value->sequenceSoftmax(*act.value, *starts); return Error(); } @@ -197,8 +200,9 @@ Error __must_check backward(Argument& act) { "Input width for each timestep of sequence softmax should be 1"); } - size_t numSequences = act.getNumSequences(); - const int* starts = act.sequenceStartPositions->getData(false); + size_t numSequences = + act.hasSubseq() ? act.getNumSubSequences() : act.getNumSequences(); + const int* starts = act.getCpuStartPositions(); for (size_t i = 0; i < numSequences; ++i) { // TODO(Dangqingqing) optimization for GPU diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 9ddd449de7500f5682d59469328f06971c6e83bf..f98bf95064fa539b990309dfe0bff10c1e99d096 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -967,8 +967,9 @@ void RecurrentGradientMachine::generateSequence() { size_t numSequences = getGenBatchSize(); resizeBootFrame(numSequences); - // We create only two sub-network in generation for alternate use. - // Thus, we can reduce total memory of output_ in layer forward. + // We create only two sub-network in generation, one stores states of all + // layers in previous time step and the other storing the states at current + // time step. resizeOrCreateFrames(2); // outFrameLines_.size() > 1UL @@ -1001,10 +1002,9 @@ void RecurrentGradientMachine::generateSequence() { // init outArg size_t resultNum = generator_.config.num_results_per_sample(); - IVector::resizeOrCreate( - generator_.outArg.ids, - generator_.config.max_num_frames() * numSequences * resultNum, - false); + size_t maxGenWordCount = + generator_.config.max_num_frames() * numSequences * resultNum; + IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false); if (resultNum > 1) { CHECK_LE(resultNum, static_cast(generator_.config.beam_size())); Matrix::resizeOrCreate(generator_.outArg.in, @@ -1012,6 +1012,11 @@ void RecurrentGradientMachine::generateSequence() { /* width */ resultNum, false, /* useGpu */ false); + Matrix::resizeOrCreate(generator_.outArg.value, + /* height */ maxGenWordCount, + /* width */ 1, + false, + /* useGpu */ false); } ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions, numSequences + 1, @@ -1313,13 +1318,20 @@ void RecurrentGradientMachine::fillGenOutputs() { starts[0] = 0; if (numResults > 1) { real* probs = generator_.outArg.in->getData(); + real* idsProb = generator_.outArg.value->getData(); + size_t curPos = 0; for (size_t i = 0; i < finalPaths_.size(); ++i) { for (size_t j = 0; j < finalPaths_[i].size(); ++j) { Path& path = finalPaths_[i][j]; - generator_.ids.push_back(path.ids.size()); // sequence size + size_t genLen = path.ids.size(); + generator_.ids.push_back(genLen); // sequence size generator_.ids.insert( generator_.ids.end(), path.ids.begin(), path.ids.end()); generator_.ids.push_back(-1); // end of sequence + + memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen); + curPos += genLen; + idsProb[curPos++] = -1.0; probs[i * numResults + j] = path.logProb; if (!j && dataArgsSize_) { diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h index f245620cf668bb341df99cf498105cbd996a6b24..fb3fc5877ac96323e891f800db80af83b6809831 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h @@ -189,6 +189,11 @@ public: */ std::vector ids; + /** + * @brief idsProb, log probability of each generated words. + */ + std::vector idsProb; + /** * @brief logProb, current probability of path. */ @@ -228,11 +233,13 @@ public: */ Path(Path& old, int newId, real logProb, int machineId, int topIndex) : ids(old.ids), + idsProb(old.idsProb), logProb(old.logProb + logProb), machineId(machineId), topIndex(topIndex), seqId(old.seqId) { ids.push_back(newId); + idsProb.push_back(logProb); if (!old.probHistory.empty()) { this->probHistory = old.probHistory; // probHistory store current prob, not sum @@ -411,8 +418,9 @@ protected: struct Generator { GeneratorConfig config; - std::vector ids; // store generated sequences - Argument outArg; // final output argument + std::vector ids; // store generated sequences + std::vector idsProb; // log probability of each generated word + Argument outArg; // final output argument }; bool generating_; Generator generator_; diff --git a/paddle/gserver/layers/ClipLayer.cpp b/paddle/gserver/layers/ClipLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..13f16c953793b82183237188b56eb61d76ecd2fd --- /dev/null +++ b/paddle/gserver/layers/ClipLayer.cpp @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" + +namespace paddle { + +/** + * A layer for clipping the input value by the threshold. + * \f[ + * out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right) + * \f] + */ + +class ClipLayer : public Layer { +protected: + double min_; + double max_; + +public: + explicit ClipLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(clip, ClipLayer); + +bool ClipLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 1U); + auto layerConf = config_.inputs(0).clip_conf(); + min_ = layerConf.min(); + max_ = layerConf.max(); + CHECK_LT(min_, max_); + return true; +} + +void ClipLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV = getInputValue(0); + resetOutput(inV->getHeight(), inV->getWidth()); + MatrixPtr outV = getOutputValue(); + outV->copyFrom(*inV); + outV->clip(min_, max_); +} + +void ClipLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV = getInputValue(0); + MatrixPtr inG = getInputGrad(0); + if (inG) { + MatrixPtr outV = getOutputValue(); + MatrixPtr outG = getOutputGrad(); + MatrixPtr tmpMtx; + Matrix::resizeOrCreate( + tmpMtx, outG->getHeight(), outG->getWidth(), false, useGpu_); + tmpMtx->clipDerivative(*inV, min_, max_); + inG->addDotMul(*outG, *tmpMtx, 1, 1); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/GruCompute.cu b/paddle/gserver/layers/GruCompute.cu index d5e547dce347c824f959425551afea66dfd94e5a..b4f5c54b14767586cb7b7e2c86cc069e2063ccfd 100644 --- a/paddle/gserver/layers/GruCompute.cu +++ b/paddle/gserver/layers/GruCompute.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "GruCompute.h" #include "hl_recurrent_apply.cuh" @@ -31,8 +30,10 @@ void GruCompute::forward<1>(hl_gru_value value, int frameSize, int batchSize) { } template <> -void GruCompute::backward<1>(hl_gru_value value, hl_gru_grad grad, - int frameSize, int batchSize) { +void GruCompute::backward<1>(hl_gru_value value, + hl_gru_grad grad, + int frameSize, + int batchSize) { hl_gpu_gru_backward(hppl::backward::gru_stateGrad(), hppl::backward::gru_resetGrad(), value, diff --git a/paddle/gserver/layers/LstmCompute.cu b/paddle/gserver/layers/LstmCompute.cu index f75c0c40ccc833e35f8fe8f21c12b3d3f68d5eb6..d3f59b52a4b3163f47a969d9a08ecd139a099e33 100644 --- a/paddle/gserver/layers/LstmCompute.cu +++ b/paddle/gserver/layers/LstmCompute.cu @@ -12,41 +12,62 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "LstmCompute.h" #include "hl_recurrent_apply.cuh" namespace paddle { template <> -void LstmCompute::forwardBatch<1>(hl_lstm_value value, int frameSize, - int batchSize) { - hl_gpu_lstm_forward(hppl::forward::lstm(), value, frameSize, - batchSize, activeNode_, activeGate_, +void LstmCompute::forwardBatch<1>(hl_lstm_value value, + int frameSize, + int batchSize) { + hl_gpu_lstm_forward(hppl::forward::lstm(), + value, + frameSize, + batchSize, + activeNode_, + activeGate_, activeState_); } template <> -void LstmCompute::backwardBatch<1>(hl_lstm_value value, hl_lstm_grad grad, - int frameSize, int batchSize) { - hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad, - frameSize, batchSize, activeNode_, - activeGate_, activeState_); +void LstmCompute::backwardBatch<1>(hl_lstm_value value, + hl_lstm_grad grad, + int frameSize, + int batchSize) { + hl_gpu_lstm_backward(hppl::backward::lstm(), + value, + grad, + frameSize, + batchSize, + activeNode_, + activeGate_, + activeState_); } template <> void LstmCompute::forwardOneSequence<1>(hl_lstm_value value, int frameSize) { - hl_gpu_lstm_forward(hppl::forward::lstm(), value, - frameSize, /* batchSize */ 1, - activeNode_, activeGate_, activeState_); + hl_gpu_lstm_forward(hppl::forward::lstm(), + value, + frameSize, + /* batchSize */ 1, + activeNode_, + activeGate_, + activeState_); } template <> -void LstmCompute::backwardOneSequence<1>(hl_lstm_value value, hl_lstm_grad grad, +void LstmCompute::backwardOneSequence<1>(hl_lstm_value value, + hl_lstm_grad grad, int frameSize) { - hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad, - frameSize, /* batchSize */ 1, - activeNode_, activeGate_, activeState_); + hl_gpu_lstm_backward(hppl::backward::lstm(), + value, + grad, + frameSize, + /* batchSize */ 1, + activeNode_, + activeGate_, + activeState_); } } // namespace paddle diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp index a97fa6bf78fce27a4e0cf329bf3309ba4a439965..0a1e17b9aa57b373f0df6e079341729539f4e193 100644 --- a/paddle/gserver/layers/PrintLayer.cpp +++ b/paddle/gserver/layers/PrintLayer.cpp @@ -29,7 +29,7 @@ public: vals.push_back(s.str()); } size_t pos = 0; - int i = 0; + size_t i = 0; std::ostringstream s; const std::string& format = config_.user_arg(); while (true) { diff --git a/paddle/gserver/layers/RowL2NormLayer.cpp b/paddle/gserver/layers/RowL2NormLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0d609be43b73a86d0d0f7b60be993836e2ea6fff --- /dev/null +++ b/paddle/gserver/layers/RowL2NormLayer.cpp @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" + +namespace paddle { + +/** + * A layer for L2 normalization in each row, + * \f[ + * out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}} + * \f] + * where the size of \f$in\f$ is (batchSize x dataDim), + * and the size of \f$out\f$ is (batchSize x dataDim). + */ + +class RowL2NormLayer : public Layer { +protected: + MatrixPtr inSquare_; + MatrixPtr l2NormReciprocal_; + MatrixPtr dotSum_; + +public: + explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(row_l2_norm, RowL2NormLayer); + +bool RowL2NormLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 1U); + + return true; +} + +void RowL2NormLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV = getInputValue(0); + + /* malloc memory for the output_ if necessary */ + size_t batchSize = inV->getHeight(); + size_t dataDim = getSize(); + CHECK_EQ(dataDim, inV->getWidth()); + resetOutput(batchSize, dataDim); + MatrixPtr outV = getOutputValue(); + + Matrix::resizeOrCreate(inSquare_, batchSize, dataDim, false, useGpu_); + inV->square2(*inSquare_); + Matrix::resizeOrCreate(l2NormReciprocal_, batchSize, 1, false, useGpu_); + inSquare_->rowSum(*l2NormReciprocal_); + l2NormReciprocal_->sqrt2(*l2NormReciprocal_); + l2NormReciprocal_->scalarDiv(*l2NormReciprocal_, 1.0); + outV->rowScale(0, *inV, *l2NormReciprocal_); +} + +void RowL2NormLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV = getInputValue(0); + MatrixPtr inG = getInputGrad(0); + MatrixPtr outV = getOutputValue(); + MatrixPtr outG = getOutputGrad(); + size_t batchSize = inV->getHeight(); + + // inG[ij] += outG[ij] / l2NormReciprocal + // inG[ij] += -inV[ij] * l2NormReciprocal * l2NormReciprocal * DotMul(outG[i], + // inV[i]) + if (inG) { + Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_); + dotSum_->zeroMem(); + dotSum_->rowDotMul(0, *outG, *outV); + dotSum_->dotMul(*dotSum_, *l2NormReciprocal_); + dotSum_->dotMul(*dotSum_, *l2NormReciprocal_); + inSquare_->rowScale(0, *inV, *dotSum_); + inG->sub(*inSquare_); + inG->addRowScale(0, *outG, *l2NormReciprocal_); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/SliceProjection.cpp b/paddle/gserver/layers/SliceProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..267dd6154b1b21cc9b936384d438a2c3bdf0c246 --- /dev/null +++ b/paddle/gserver/layers/SliceProjection.cpp @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Projection.h" + +namespace paddle { + +/** + * SliceProjection can slice the input value into multiple parts, + * and then select some of them to merge into a new output. + * + * First, calculate the slices that need to be merged into the output. + * slices = input.slices().for_output() + * + * Second, merge each slice into the output. + * for(auto slice: slices) { + * out.addAtOffset(slice, offset); + * } + * + * Input slices as output: s0, s1, ...: + * ----------------------- + * |///| |//////| | + * |/s0| |//s1//| | + * |///| |//////| | + * ----------------------- + * Output, merge s0, s1, ... into one output: + * ---------------- + * |///|//////| | + * |/s0|//s1//|...| + * |///|//////| | + * ---------------- + * + * The config file api is slice_projection. + */ +class SliceProjection : public Projection { +public: + SliceProjection(const ProjectionConfig& config, + const ParameterPtr& parameter, + bool useGpu); + virtual void forward(); + virtual void backward(const UpdateCallback& callback); + +protected: + std::vector> slices_; +}; + +REGISTER_PROJECTION(slice, SliceProjection); + +/** + * Constructed function. + * @note SliceProjection should not have any parameter. + */ +SliceProjection::SliceProjection(const ProjectionConfig& config, + const ParameterPtr& parameter, + bool useGpu) + : Projection(config, parameter, useGpu) { + CHECK(!parameter) << "'slice' projection should not have any parameter"; + + slices_.reserve(config.slices_size()); + for (const auto& slice : config.slices()) { + slices_.push_back(std::make_pair(slice.start(), slice.end())); + } +} + +void SliceProjection::forward() { + size_t offset = 0; + for (auto& slice : slices_) { + auto slice_out = in_->value->subColMatrix(slice.first, slice.second); + out_->value->addAtOffset(*slice_out, offset); + offset += slice_out->getWidth(); + } +} + +void SliceProjection::backward(const UpdateCallback& callback) { + if (in_->grad) { + size_t offset = 0; + for (auto& slice : slices_) { + auto slice_out = in_->grad->subColMatrix(slice.first, slice.second); + slice_out->addAtOffset(*out_->grad, offset); + offset += slice_out->getWidth(); + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index a43adc7ce7db937bd62ea9bf1533b8a5899c259a..4546d12a903084e7a746b967c39d67a0ade4c0cd 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -1,5 +1,10 @@ # gserver pacakge unittests +file(GLOB_RECURSE GSERVER_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.h") +file(GLOB_RECURSE GSERVER_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cpp") +add_style_check_target(paddle_gserver ${GSERVER_SOURCES}) +add_style_check_target(paddle_gserver ${GSERVER_HEADER}) + ################### test_ProtoDataProvider ############ add_unittest_without_exec(test_ProtoDataProvider test_ProtoDataProvider.cpp) @@ -50,7 +55,7 @@ add_unittest_without_exec(test_DetectionOutput test_DetectionOutput.cpp LayerGradUtil.cpp) -add_test(NAME test_DetectionOutput +add_test(NAME test_DetectionOutput COMMAND test_DetectionOutput) ################# test_ConvUnify ####################### add_unittest_without_exec(test_ConvUnify diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index 9eca58f1a1baa6fb1c404a91a345bc7f9d6b4acc..fd9cfa1dc7a9028cb2c5c98baca98ffb2a837bac 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -400,7 +400,6 @@ void initDataLayer(TestConfig testConf, const std::vector& labelSeqStartPositions = testConf.inputDefs[i].labelSeqStartPositions; if (labelSeqStartPositions.size() != 0) { - CHECK(!sequenceStartPositions); CHECK_GE(static_cast(labelSeqStartPositions.size()), 2); sequenceStartPositions = @@ -410,6 +409,19 @@ void initDataLayer(TestConfig testConf, useGpu); data.sequenceStartPositions = sequenceStartPositions; } + + const std::vector& labelSubSeqStartPositions = + testConf.inputDefs[i].labelSubSeqStartPositions; + if (labelSubSeqStartPositions.size() != 0) { + CHECK_GE(static_cast(labelSubSeqStartPositions.size()), 2); + + subSequenceStartPositions = + ICpuGpuVector::create(labelSubSeqStartPositions.size(), useGpu); + subSequenceStartPositions->copyFrom(labelSubSeqStartPositions.data(), + labelSubSeqStartPositions.size(), + useGpu); + data.subSequenceStartPositions = subSequenceStartPositions; + } break; } default: diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index d299b4dd09418589514d99a72f83e1103ace7de1..5debedf5ef6a3262578ca01b335e664f9a334d35 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -67,6 +67,7 @@ struct InputDef { bool isStatic; std::vector labelInitValue; std::vector labelSeqStartPositions; + std::vector labelSubSeqStartPositions; MatrixPtr selfDefinedData; InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) { @@ -81,8 +82,10 @@ struct InputDef { InputDef(InputType type, string nameIn, MatrixPtr selfDefinedData, - std::vector selfDefinedSeqStartPos = {}) + std::vector selfDefinedSeqStartPos = {}, + std::vector selfDefinedSubSeqStartPos = {}) : labelSeqStartPositions(selfDefinedSeqStartPos), + labelSubSeqStartPositions(selfDefinedSubSeqStartPos), selfDefinedData(selfDefinedData) { inputType = type; name = nameIn; diff --git a/paddle/gserver/tests/concat_slice_a.conf b/paddle/gserver/tests/concat_slice_a.conf new file mode 100644 index 0000000000000000000000000000000000000000..dccf911089e16f4f97b1470ee39d192d4557d4bd --- /dev/null +++ b/paddle/gserver/tests/concat_slice_a.conf @@ -0,0 +1,41 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from paddle.trainer_config_helpers import * + +settings(batch_size=10) + +data = data_layer(name ="input", size=8*16*16) + +conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=False, + act=ReluActivation()) +conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=False, + act=ReluActivation()) + +proj1 = slice_projection(input=conv1, slices=[(0, 4), (4, 12)]) + +proj2 = slice_projection(input=conv2, slices=[(1, 5), (5, 15)]) + +concat = concat_layer(input=[proj1, proj2]) + +outputs(concat) + diff --git a/paddle/gserver/tests/concat_slice_b.conf b/paddle/gserver/tests/concat_slice_b.conf new file mode 100644 index 0000000000000000000000000000000000000000..29686ef2810370af3f84b60b2450d5c7d2e7663d --- /dev/null +++ b/paddle/gserver/tests/concat_slice_b.conf @@ -0,0 +1,41 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from paddle.trainer_config_helpers import * + +settings(batch_size=10) + +data = data_layer(name ="input", size=8*16*16) + +conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=False, + act=ReluActivation()) +conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=False, + act=ReluActivation()) + +proj1 = slice_projection(input=conv1, slices=[(0, 12)]) + +proj2 = slice_projection(input=conv2, slices=[(1, 15)]) + +concat = concat_layer(input=[proj1, proj2]) + +outputs(concat) + diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp index b201ba8a5a4146ab28cd96454f434f889d72a968..de93972a5880518dfbfb9f8582e17c594e54b9b8 100644 --- a/paddle/gserver/tests/test_ActivationGrad.cpp +++ b/paddle/gserver/tests/test_ActivationGrad.cpp @@ -57,6 +57,39 @@ TEST(Activation, activation) { } } +void testSequenceSoftmaxAct(bool hasSubseq) { + LOG(INFO) << "test activation: sequence softmax"; + + const size_t size = 1; + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("addto"); + config.layerConfig.set_size(size); + config.layerConfig.set_active_type("sequence_softmax"); + config.inputDefs.push_back( + {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, + "layer_0", + 1, + 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "sequence_softmax", + 100, + /* trans= */ false, + useGpu, + /* useWeight */ true); + } +} + +TEST(SequenceSoftmaxActivation, activation) { + for (auto hasSubseq : {false, true}) { + LOG(INFO) << "hasSubseq = " << hasSubseq; + testSequenceSoftmaxAct(hasSubseq); + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 0975c3bc9573c6ccb8f0ac98c41586d322d2465e..fe11278f41c0118ee0bdb34f17fbf9602e0fa76b 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -152,6 +152,26 @@ TEST(Projection, identity) { } } +TEST(Projection, slice) { + ProjectionConfig conf; + conf.set_type("slice"); + conf.set_input_size(100); + SliceConfig& slice1 = *conf.add_slices(); + slice1.set_start(10); + slice1.set_end(20); + SliceConfig& slice2 = *conf.add_slices(); + slice2.set_start(50); + slice2.set_end(70); + conf.set_output_size(30); + for (auto useGpu : {false, true}) { + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 0, + /* batchSize */ 10, + useGpu); + } +} + TEST(Projection, scaling) { ProjectionConfig conf; conf.set_type("scaling"); @@ -1879,6 +1899,36 @@ TEST(Layer, CropLayer) { } } +TEST(Layer, ClipLayer) { + const size_t batchSize = 128; + const size_t size = 512; + TestConfig config; + config.layerConfig.set_type("clip"); + config.inputDefs.push_back({INPUT_DATA, "input", size, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + ClipConfig* layerConf = input->mutable_clip_conf(); + double p1 = std::rand() / (double)RAND_MAX; + double p2 = std::rand() / (double)RAND_MAX; + layerConf->set_min(std::min(p1, p2)); + layerConf->set_max(std::max(p1, p2)); + for (auto useGpu : {false, true}) { + testLayerGrad(config, "clip", batchSize, false, useGpu, false); + } +} + +TEST(Layer, RowL2NormLayer) { + const size_t batchSize = 128; + const size_t size = 512; + TestConfig config; + config.layerConfig.set_type("row_l2_norm"); + config.layerConfig.set_size(size); + config.inputDefs.push_back({INPUT_DATA, "input", size, 0}); + config.layerConfig.add_inputs(); + for (auto useGpu : {false, true}) { + testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false); + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp index 40e662b22bac0a2d22aea31fe99b11695bac3f57..f930c72fde3f5e0a6a45cb6bfd3507a4f48028fc 100644 --- a/paddle/gserver/tests/test_NetworkCompare.cpp +++ b/paddle/gserver/tests/test_NetworkCompare.cpp @@ -237,6 +237,12 @@ TEST(Compare, concat_table) { compareNetwork(config_file_a, config_file_b); } +TEST(Compare, concat_slice) { + std::string config_file_a = "./gserver/tests/concat_slice_a.conf"; + std::string config_file_b = "./gserver/tests/concat_slice_b.conf"; + compareNetwork(config_file_a, config_file_b); +} + #ifndef PADDLE_ONLY_CPU TEST(Compare, img_pool) { std::string config_file_a = "./gserver/tests/img_pool_a.conf"; diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu index de48b6fac9c7d8125a552022c52353ef6bcef995..5435808fb7f70fdf1ac98815f7fe8890fb85527c 100644 --- a/paddle/math/BaseMatrix.cu +++ b/paddle/math/BaseMatrix.cu @@ -12,21 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include #include +#include +#include #include "BaseMatrix.h" -#include "hl_matrix_ops.cuh" -#include "hl_matrix_base.cuh" -#include "hl_matrix_apply.cuh" -#include "SIMDFunctions.h" #include "MathFunctions.h" +#include "SIMDFunctions.h" +#include "hl_matrix_apply.cuh" +#include "hl_matrix_base.cuh" +#include "hl_matrix_ops.cuh" namespace paddle { const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported."; -template +template template int BaseMatrixT::applyUnary(Op op) { MatrixOffset offset(0, 0); @@ -34,9 +34,11 @@ int BaseMatrixT::applyUnary(Op op) { return 0; } -template +template template -int BaseMatrixT::applyUnary(Op op, int numRows, int numCols, +int BaseMatrixT::applyUnary(Op op, + int numRows, + int numCols, MatrixOffset& offset) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; int dimM = numRows; @@ -56,7 +58,7 @@ int BaseMatrixT::applyUnary(Op op, int numRows, int numCols, return 0; } -template +template template int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b) { CHECK(height_ == b.height_ && width_ == b.width_) @@ -67,18 +69,23 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b) { return 0; } -template +template template -int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, - MatrixOffset& offset) { +int BaseMatrixT::applyBinary( + Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) { applyBinary(op, b, numRows, numCols, offset, false_type(), false_type()); return 0; } -template +template template -int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, - MatrixOffset& offset, bAsRowVector, bAsColVector) { +int BaseMatrixT::applyBinary(Op op, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + bAsRowVector, + bAsColVector) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch"; @@ -91,8 +98,8 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, T* A = data_; T* B = b.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); if (!bAsRowVector::value && !bAsColVector::value) { @@ -115,7 +122,7 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, return 0; } -template +template template int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) { CHECK_EQ(height_, b.height_); @@ -129,21 +136,29 @@ int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) { return 0; } -template +template template -int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, - int numRows, int numCols, +int BaseMatrixT::applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, MatrixOffset& offset) { applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type()); return 0; } -template +template template -int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, - int numRows, int numCols, MatrixOffset& offset, - cAsRowVector, cAsColVector) { +int BaseMatrixT::applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + cAsRowVector, + cAsColVector) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR; @@ -160,10 +175,10 @@ int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, T* B = b.data_; T* C = c.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); - CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, - offset.cRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); @@ -180,21 +195,21 @@ int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, } if (true == useGpu_) { - hl_gpu_apply_ternary_op - ( + hl_gpu_apply_ternary_op( op, A, B, C, dimM, dimN, lda, ldb, ldc); } else { - hl_cpu_apply_ternary_op - ( + hl_cpu_apply_ternary_op( op, A, B, C, dimM, dimN, lda, ldb, ldc); } return 0; } -template +template template -int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, +int BaseMatrixT::applyQuaternary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, BaseMatrixT& d) { CHECK_EQ(height_, b.height_); CHECK_EQ(width_, b.width_); @@ -209,10 +224,14 @@ int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, return 0; } -template +template template -int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, - BaseMatrixT& d, int numRows, int numCols, +int BaseMatrixT::applyQuaternary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + BaseMatrixT& d, + int numRows, + int numCols, MatrixOffset& offset) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; @@ -234,12 +253,12 @@ int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, T* C = c.data_; T* D = d.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); - CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, - offset.cRow_); - CAL_MATRIX_START_ADDRESS(D, d.height_, d.width_, ldd, offset.dCol_, - offset.dRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); + CAL_MATRIX_START_ADDRESS( + D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); @@ -250,22 +269,29 @@ int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, CHECK_LE(dimM + offset.dRow_, d.height_); CHECK_LE(dimN + offset.dCol_, d.width_); if (true == useGpu_) { - hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, - ldc, ldd); + hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd); } else { - hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, - ldc, ldd); + hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd); } return 0; } -template -template +template -int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, - int numRows, int numCols, MatrixOffset& offset, - aAsRowVector, aAsColVector) { +int BaseMatrixT::aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, + aAsColVector) { CHECK_EQ(useGpu_, b.useGpu_); int ld = stride_; @@ -273,10 +299,10 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, T* dst = data_; T* B = b.data_; - CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_, - offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); + CAL_MATRIX_START_ADDRESS( + dst, height_, width_, ld, offset.aCol_, offset.aRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); if (aAsRowVector::value && !aAsColVector::value) { if (useGpu_) { @@ -297,12 +323,21 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, return 0; } -template -template +template -int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, - BaseMatrixT& c, int numRows, int numCols, - MatrixOffset& offset, aAsRowVector, +int BaseMatrixT::aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, aAsColVector) { CHECK_EQ(useGpu_, b.useGpu_); CHECK_EQ(useGpu_, c.useGpu_); @@ -314,28 +349,28 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, T* dst = data_; T* B = b.data_; T* C = c.data_; - CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_, - offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); - CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, - offset.cRow_); + CAL_MATRIX_START_ADDRESS( + dst, height_, width_, ld, offset.aCol_, offset.aRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); if (aAsRowVector::value && !aAsColVector::value) { if (useGpu_) { - hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, - ldb, C, ldc); + hl_gpu_matrix_column_op( + agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); } else { - hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, - ldb, C, ldc); + hl_cpu_matrix_column_op( + agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); } } else if (!aAsRowVector::value && aAsColVector::value) { if (useGpu_) { - hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, - ldb, C, ldc); + hl_gpu_matrix_row_op( + agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); } else { - hl_cpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, - ldb, C, ldc); + hl_cpu_matrix_row_op( + agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); } } else { LOG(FATAL) << "not supported"; @@ -350,15 +385,19 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, */ DEFINE_MATRIX_UNARY_OP(Neg, a = -a); -template -void BaseMatrixT::neg() { applyUnary(unary::Neg()); } +template +void BaseMatrixT::neg() { + applyUnary(unary::Neg()); +} DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a)); -template<> -void BaseMatrixT::exp2() { applyUnary(unary::Exp()); } +template <> +void BaseMatrixT::exp2() { + applyUnary(unary::Exp()); +} DEFINE_MATRIX_UNARY_OP(Log, a = log(a)); -template<> +template <> void BaseMatrixT::log2() { if (useGpu_) { applyUnary(unary::Log()); @@ -368,30 +407,42 @@ void BaseMatrixT::log2() { } DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a)); -template<> -void BaseMatrixT::sqrt2() { applyUnary(unary::Sqrt()); } +template <> +void BaseMatrixT::sqrt2() { + applyUnary(unary::Sqrt()); +} DEFINE_MATRIX_UNARY_OP(Square, a = a * a); -template -void BaseMatrixT::square2() { applyUnary(unary::Square()); } +template +void BaseMatrixT::square2() { + applyUnary(unary::Square()); +} DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a); -template -void BaseMatrixT::reciprocal2() { applyUnary(unary::Reciprocal()); } +template +void BaseMatrixT::reciprocal2() { + applyUnary(unary::Reciprocal()); +} DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a); -template -void BaseMatrixT::abs2() { applyUnary(unary::Abs()); } +template +void BaseMatrixT::abs2() { + applyUnary(unary::Abs()); +} DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0)); -template -void BaseMatrixT::sign2() { applyUnary(unary::Sign()); } +template +void BaseMatrixT::sign2() { + applyUnary(unary::Sign()); +} DEFINE_MATRIX_UNARY_OP(Zero, a = 0); -template -void BaseMatrixT::zero() { applyUnary(unary::Zero()); } +template +void BaseMatrixT::zero() { + applyUnary(unary::Zero()); +} -template +template void BaseMatrixT::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { int numRows = height_; int numCols = numColumns; @@ -400,11 +451,13 @@ void BaseMatrixT::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { } DEFINE_MATRIX_UNARY_OP(One, a = 1); -template -void BaseMatrixT::one() { applyUnary(unary::One()); } +template +void BaseMatrixT::one() { + applyUnary(unary::One()); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p)); -template<> +template <> void BaseMatrixT::pow2(real p) { if (useGpu_) { applyUnary(unary::Pow(p)); @@ -414,44 +467,67 @@ void BaseMatrixT::pow2(real p) { } DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p); -template -void BaseMatrixT::subScalar(T p) { applyUnary(unary::SubScalar(p)); } +template +void BaseMatrixT::subScalar(T p) { + applyUnary(unary::SubScalar(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p); -template -void BaseMatrixT::mulScalar(T p) { applyUnary(unary::MulScalar(p)); } +template +void BaseMatrixT::mulScalar(T p) { + applyUnary(unary::MulScalar(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p); -template -void BaseMatrixT::divScalar(T p) { applyUnary(unary::DivScalar(p)); } +template +void BaseMatrixT::divScalar(T p) { + applyUnary(unary::DivScalar(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p); -template -void BaseMatrixT::assign(T p) { applyUnary(unary::Assign(p)); } +template +void BaseMatrixT::assign(T p) { + applyUnary(unary::Assign(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p); -template -void BaseMatrixT::add(T p) { applyUnary(unary::Add(p)); } +template +void BaseMatrixT::add(T p) { + applyUnary(unary::Add(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2); -template -void BaseMatrixT::add(T p1, T p2) { applyUnary(unary::Add2(p1, p2)); } +template +void BaseMatrixT::add(T p1, T p2) { + applyUnary(unary::Add2(p1, p2)); +} -DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER, +DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, + TWO_PARAMETER, a = a < p1 ? p1 : (a > p2 ? p2 : a)); -template -void BaseMatrixT::clip(T p1, T p2) { applyUnary(unary::Clip(p1, p2)); } +template +void BaseMatrixT::clip(T p1, T p2) { + applyUnary(unary::Clip(p1, p2)); +} -DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, ONE_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, + TWO_PARAMETER, + a = b < p1 ? 0 : (b > p2 ? 0 : 1)); +template +void BaseMatrixT::clipDerivative(BaseMatrixT& b, T p1, T p2) { + applyBinary(binary::ClipDerivative(p1, p2), b); +} + +DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, + ONE_PARAMETER, a = a > p ? 1.0f : 0.0f); -template +template void BaseMatrixT::biggerThanScalar(T p) { applyUnary(unary::BiggerThanScalar(p)); } -DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, - a = a > p ? a : p); -template +DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? a : p); +template void BaseMatrixT::downClip(T p) { applyUnary(unary::DownClip(p)); } @@ -462,12 +538,12 @@ void BaseMatrixT::downClip(T p) { */ DEFINE_MATRIX_BINARY_OP(Add, a += b); -template +template void BaseMatrixT::add(BaseMatrixT& b) { applyBinary(binary::Add(), b); } -template<> +template <> void BaseMatrixT::add(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Add(), b); @@ -478,7 +554,7 @@ void BaseMatrixT::add(BaseMatrixT& b) { } } -template +template void BaseMatrixT::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { if (columnOffset + b.width_ <= width_) { int numRows = height_; @@ -497,43 +573,53 @@ void BaseMatrixT::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { } } -template +template void BaseMatrixT::addP2P(BaseMatrixT& b) { T* A = data_; T* B = b.data_; int dimM = height_; int dimN = width_; - hl_gpu_apply_binary_op, 0, 0> - (binary::Add(), A, B, dimM, dimN, dimN, dimN); + hl_gpu_apply_binary_op, 0, 0>( + binary::Add(), A, B, dimM, dimN, dimN, dimN); } -template +template void BaseMatrixT::addColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add(), b, numRows, numCols, offset, false_type(), + applyBinary(binary::Add(), + b, + numRows, + numCols, + offset, + false_type(), true_type() /* bAsColVector */); } -template +template void BaseMatrixT::addRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add(), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::Add(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p) { applyBinary(binary::Add1(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p)); -template<> +template <> void BaseMatrixT::pow2(BaseMatrixT& b, real p) { if (useGpu_) { applyBinary(binary::Pow(p), b); @@ -543,36 +629,45 @@ void BaseMatrixT::pow2(BaseMatrixT& b, real p) { } DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::Add2(p1, p2), b); } -template +template void BaseMatrixT::addBias(BaseMatrixT& b, T scale) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add1(scale), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::Add1(scale), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } DEFINE_MATRIX_BINARY_OP(Sub, a -= b); -template -void BaseMatrixT::sub(BaseMatrixT& b) { applyBinary(binary::Sub(), b); } +template +void BaseMatrixT::sub(BaseMatrixT& b) { + applyBinary(binary::Sub(), b); +} DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p); -template +template void BaseMatrixT::sub(BaseMatrixT& b, T p) { applyBinary(binary::Sub1(p), b); } DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f); -template -void BaseMatrixT::relu(BaseMatrixT& b) { applyBinary(binary::Relu(), b); } +template +void BaseMatrixT::relu(BaseMatrixT& b) { + applyBinary(binary::Relu(), b); +} DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f)); -template +template void BaseMatrixT::reluDerivative(BaseMatrixT& b) { applyBinary(binary::ReluDerivative(), b); } @@ -582,7 +677,7 @@ DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0; ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a)))); -template<> +template <> void BaseMatrixT::softrelu(BaseMatrixT& b) { applyBinary(binary::Softrelu(), b); } @@ -592,97 +687,100 @@ DEFINE_MATRIX_BINARY_OP( a *= (1.0 - exp(-1.0 * ((b > THRESHOLD) ? THRESHOLD : ((b < -THRESHOLD) ? (-THRESHOLD) : b))))); -template<> +template <> void BaseMatrixT::softreluDerivative(BaseMatrixT& b) { applyBinary(binary::SoftreluDerivative(), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1; b = b < p2 ? b : p2); -template +template void BaseMatrixT::brelu(BaseMatrixT& b) { - int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. + int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. applyBinary(binary::Brelu(p1, p2), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, + TWO_PARAMETER, a *= (b > p1 && b < p2) ? 1.0 : 0.0); -template +template void BaseMatrixT::breluDerivative(BaseMatrixT& b) { int p1 = 0, p2 = 24; applyBinary(binary::BreluDerivative(p1, p2), b); } DEFINE_MATRIX_BINARY_OP(Square, b = a * a); -template +template void BaseMatrixT::square2(BaseMatrixT& b) { applyBinary(binary::Square(), b); } DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b); -template +template void BaseMatrixT::squareDerivative(BaseMatrixT& b) { applyBinary(binary::SquareDerivative(), b); } -DEFINE_MATRIX_BINARY_OP(Tanh, - T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); -template<> +DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); +template <> void BaseMatrixT::tanh(BaseMatrixT& b) { applyBinary(binary::Tanh(), b); } DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b); -template +template void BaseMatrixT::tanhDerivative(BaseMatrixT& b) { applyBinary(binary::TanhDerivative(), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanh, TWO_PARAMETER, - b = p1 * - (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); -template<> +DEFINE_MATRIX_BINARY_PARAMETER_OP( + ScaledTanh, TWO_PARAMETER, b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); +template <> void BaseMatrixT::scaledTanh(BaseMatrixT& b, real p1, real p2) { applyBinary(binary::ScaledTanh(p1, p2), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, + TWO_PARAMETER, a *= p2 * (p1 - b * b)); -template +template void BaseMatrixT::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::ScaledTanhDerivative(p1 * p1, p2 / p1), b); } DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a); -template +template void BaseMatrixT::reciprocal2(BaseMatrixT& b) { applyBinary(binary::Reciprocal(), b); } DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b); -template +template void BaseMatrixT::reciprocalDerivative(BaseMatrixT& b) { applyBinary(binary::ReciprocalDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a); -template -void BaseMatrixT::abs2(BaseMatrixT& b) { applyBinary(binary::Abs(), b); } +template +void BaseMatrixT::abs2(BaseMatrixT& b) { + applyBinary(binary::Abs(), b); +} DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0); -template +template void BaseMatrixT::absDerivative(BaseMatrixT& b) { applyBinary(binary::AbsDerivative(), b); } -DEFINE_MATRIX_BINARY_OP( - Sigmoid, const T THRESHOLD_MIN = -40.0; const T THRESHOLD_MAX = 13.0; - T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN - : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a); - b = 1.0f / (1.0f + exp(-tmp))); -template<> +DEFINE_MATRIX_BINARY_OP(Sigmoid, const T THRESHOLD_MIN = -40.0; + const T THRESHOLD_MAX = 13.0; + T tmp = (a < THRESHOLD_MIN) + ? THRESHOLD_MIN + : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a); + b = 1.0f / (1.0f + exp(-tmp))); +template <> void BaseMatrixT::sigmoid(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Sigmoid(), b); @@ -716,31 +814,31 @@ void BaseMatrixT::sigmoid(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b)); -template +template void BaseMatrixT::sigmoidDerivative(BaseMatrixT& b) { applyBinary(binary::SigmoidDerivative(), b); } DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b); -template +template void BaseMatrixT::expDerivative(BaseMatrixT& b) { applyBinary(binary::ExpDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f); -template +template void BaseMatrixT::sign2(BaseMatrixT& b) { applyBinary(binary::Sign(), b); } DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b)); -template<> +template <> void BaseMatrixT::exp2(BaseMatrixT& b) { applyBinary(binary::Exp(), b); } DEFINE_MATRIX_BINARY_OP(Log, a = log(b)); -template<> +template <> void BaseMatrixT::log2(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Log(), b); @@ -750,13 +848,13 @@ void BaseMatrixT::log2(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b)); -template<> +template <> void BaseMatrixT::sqrt2(BaseMatrixT& b) { applyBinary(binary::Sqrt(), b); } DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b)); -template<> +template <> void BaseMatrixT::invSqrt(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::InvSqrt(), b); @@ -768,37 +866,37 @@ void BaseMatrixT::invSqrt(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p)); -template +template void BaseMatrixT::isEqualTo(BaseMatrixT& b, T value) { applyBinary(binary::IsEqual(value), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p); -template +template void BaseMatrixT::addScalar(BaseMatrixT& b, T p) { applyBinary(binary::AddScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p); -template +template void BaseMatrixT::subScalar(BaseMatrixT& b, T p) { applyBinary(binary::SubScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p); -template +template void BaseMatrixT::mulScalar(BaseMatrixT& b, T p) { applyBinary(binary::MulScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p); -template +template void BaseMatrixT::divScalar(BaseMatrixT& b, T p) { applyBinary(binary::DivScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b); -template +template void BaseMatrixT::scalarDiv(BaseMatrixT& b, T p) { applyBinary(binary::ScalarDiv(p), b); } @@ -810,20 +908,20 @@ void BaseMatrixT::scalarDiv(BaseMatrixT& b, T p) { DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy, a = -c * log(b) - (1 - c) * log(1 - b)); -template<> +template <> void BaseMatrixT::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::SoftCrossEntropy(), b, c); } DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b))); -template +template void BaseMatrixT::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::SoftCrossEntropyBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy, a = c > 0.5 ? -log(b) : -log(1.0 - b)); -template<> +template <> void BaseMatrixT::binaryLabelCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { if (useGpu_) { @@ -851,70 +949,73 @@ void BaseMatrixT::binaryLabelCrossEntropy(BaseMatrixT& b, DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp, a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)); -template +template void BaseMatrixT::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::BinaryCrossEntropyBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(Add, a = b + c); -template +template void BaseMatrixT::add(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Add(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { applyTernary(ternary::Add1(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c); -template +template void BaseMatrixT::sub(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Sub(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c); -template +template void BaseMatrixT::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { applyTernary(ternary::Sub1(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c); -template +template void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Add2(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, + THREE_PARAMETER, a = p1 * a + p2 * b + p3 * c); -template +template void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { applyTernary(ternary::Add3(p1, p2, p3), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, + THREE_PARAMETER, c = p2 * c - p1 * (b + p3 * a); a = a + c); -template +template void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad BaseMatrixT& c, // mom - T p1, // learningRate, - T p2, // momentum, - T p3) { // decayRate + T p1, // learningRate, + T p2, // momentum, + T p3) { // decayRate applyTernary(ternary::SgdUpdate(p1, p2, p3), b, c); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, + THREE_PARAMETER, c = p2 * c - p1 * d * (b + p3 * a); a += c); -template +template void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad, BaseMatrixT& c, // mom, BaseMatrixT& d, // lr, - T p1, // learningRate, - T p2, // momentum, - T p3) { // decayRate + T p1, // learningRate, + T p2, // momentum, + T p3) { // decayRate applyQuaternary(quaternary::SgdUpdate(p1, p2, p3), b, c, d); } @@ -922,19 +1023,22 @@ DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b; a = (a > lambda) ? (a - lambda) : (a < -lambda) ? (a + lambda) : 0); -template +template void BaseMatrixT::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) { applyBinary(binary::ApplyL1(learningRate * decayRate), lr); } -template<> +template <> void BaseMatrixT::applyL1(BaseMatrixT& lr, real learningRate, real decayRate) { if (useGpu_) { applyBinary(binary::ApplyL1(learningRate * decayRate), lr); } else { - simd::decayL1(this->data_, this->data_, lr.data_, learningRate * decayRate, + simd::decayL1(this->data_, + this->data_, + lr.data_, + learningRate * decayRate, height_ * width_); } } @@ -943,24 +1047,25 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p; a = (a > lambda) ? (a - lambda) : (a < -lambda) ? (a + lambda) : 0); -template +template void BaseMatrixT::applyL1(T learningRate, T decayRate) { applyUnary(unary::ApplyL1(learningRate * decayRate)); } -template<> +template <> void BaseMatrixT::applyL1(real learningRate, real decayRate) { if (useGpu_) { applyUnary(unary::ApplyL1(learningRate * decayRate)); } else { - simd::decayL1(this->data_, this->data_, learningRate * decayRate, - height_ * width_); + simd::decayL1( + this->data_, this->data_, learningRate * decayRate, height_ * width_); } } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, ONE_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, + ONE_PARAMETER, a *= (1.0f / (1.0f + p * b))); -template +template void BaseMatrixT::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { if (useGpu_) { applyBinary(binary::ApplyL2(learningRate * decayRate), lr); @@ -973,32 +1078,33 @@ void BaseMatrixT::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { } } -template +template void BaseMatrixT::applyL2(T learningRate, T decayRate) { BaseMatrixT::mulScalar(1.0f / (1.0f + learningRate * decayRate)); } DEFINE_MATRIX_BINARY_OP(DotMul, a *= b); -template +template void BaseMatrixT::dotMul(BaseMatrixT& b) { applyBinary(binary::DotMul(), b); } DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c); -template +template void BaseMatrixT::dotMul(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotMul(), b, c); } DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c); -template +template void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotDiv(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, + TWO_PARAMETER, a = (b + p1) / (c + p2)); -template +template void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotDiv2P(p1, p2), b, c); } @@ -1008,7 +1114,7 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c; ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a); a = log(1 + exp(a)) - a * d); -template<> +template <> void BaseMatrixT::rankLoss(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1019,8 +1125,9 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c; a = (a > THRESHOLD) ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a); - a = exp(a); a = (a / (1 + a) - d)); -template<> + a = exp(a); + a = (a / (1 + a) - d)); +template <> void BaseMatrixT::rankLossBp(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1033,7 +1140,7 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0; ? -THRESHOLD : b; a = log(1 + exp(x)) - c * x); -template<> +template <> void BaseMatrixT::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::LogisticRegressionLoss(), b, c); } @@ -1043,22 +1150,23 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0; T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) ? -THRESHOLD : b; - x = exp(x); a = x / (1 + x) - c); -template<> + x = exp(x); + a = x / (1 + x) - c); +template <> void BaseMatrixT::logisticRegressionLossBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::LogisticRegressionLossBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f); -template +template void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::BiggerThan(), b, c); } DEFINE_MATRIX_QUATERNARY_OP( BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f); -template +template void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1066,25 +1174,34 @@ void BaseMatrixT::biggerThan(BaseMatrixT& b, } DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c); -template +template void BaseMatrixT::max2(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Max(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, ONE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, + ONE_PARAMETER, c += ((a > p) == (b > p)) ? 0.0f : 1.0f); -template -void BaseMatrixT::binaryClassificationError2(size_t destCol, BaseMatrixT& b, - BaseMatrixT& c, T p) { +template +void BaseMatrixT::binaryClassificationError2(size_t destCol, + BaseMatrixT& b, + BaseMatrixT& c, + T p) { CHECK(!useGpu_) << "do not support gpu"; MatrixOffset offset(0, 0, 0, 0, destCol, 0); int numRows = b.height_; int numCols = b.width_; - b.applyTernary(ternary::BinaryClassificationError(p), c, *this, numRows, - numCols, offset, false_type(), true_type() /*cAsColVector*/); + b.applyTernary(ternary::BinaryClassificationError(p), + c, + *this, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } -template<> +template <> void BaseMatrixT::binaryClassificationError(size_t destCol, BaseMatrixT& b, BaseMatrixT& c, @@ -1092,127 +1209,148 @@ void BaseMatrixT::binaryClassificationError(size_t destCol, MatrixOffset offset(destCol, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; - aggregate(aggregate::sum(), base::binary::classificationError(p), - base::binary::add(), b, c, numRows, numCols, offset, false_type(), + aggregate(aggregate::sum(), + base::binary::classificationError(p), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + false_type(), true_type() /*aAsColVector*/); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, + THREE_PARAMETER, a = p1 * b + p2 * c + p3 * d); -template -void BaseMatrixT::add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, - T p2, T p3) { +template +void BaseMatrixT::add3( + BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) { applyQuaternary(quaternary::Add3(p1, p2, p3), b, c, d); } DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c); -template +template void BaseMatrixT::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotMulSquare(), b, c); } DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c); -template +template void BaseMatrixT::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotSquareSquare(), b, c); } DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b); -template +template void BaseMatrixT::dotMulSquare(BaseMatrixT& b) { applyBinary(binary::DotMulSquare(), b); } DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b); -template +template void BaseMatrixT::dotSquareMul(BaseMatrixT& b) { applyBinary(binary::DotSquareMul(), b); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, + THREE_PARAMETER, T tmp = p1 * b + p2 * c + p3 * d; a += tmp * tmp); -template -void BaseMatrixT::addSquareSum(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, - T p1, T p2, T p3) { +template +void BaseMatrixT::addSquareSum( + BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) { applyQuaternary(quaternary::AddSquareSum(p1, p2, p3), b, c, d); } DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b); -template +template void BaseMatrixT::addSquare(BaseMatrixT& b, T p) { applyBinary(binary::AddSquare(p), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, + TWO_PARAMETER, a = p1 * a + p2 * b * b); -template +template void BaseMatrixT::decayAddSquare(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::DecayAddSquare(p1, p2), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, + TWO_PARAMETER, a = p1 * a + p2 * b * b * c * c); -template -void BaseMatrixT::decayAddSquareMul(BaseMatrixT& b, BaseMatrixT& c, T p1, +template +void BaseMatrixT::decayAddSquareMul(BaseMatrixT& b, + BaseMatrixT& c, + T p1, T p2) { applyTernary(ternary::DecayAddSquareMul(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, + THREE_PARAMETER, a = 1 / (p1 * b + p2 * c + p3)); -template -void BaseMatrixT::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, - T p3) { +template +void BaseMatrixT::reciprocalSum( + BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { applyTernary(ternary::ReciprocalSum(p1, p2, p3), b, c); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, + TWO_PARAMETER, a = 1 / (p1 * b + p2)); -template +template void BaseMatrixT::reciprocal2(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::Reciprocal2(p1, p2), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, + TWO_PARAMETER, T tmp = p1 * b + p2 * c; a *= tmp * tmp); -template -void BaseMatrixT::dotMulSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, +template +void BaseMatrixT::dotMulSquareSum(BaseMatrixT& b, + BaseMatrixT& c, + T p1, T p2) { applyTernary(ternary::DotMulSquareSum(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, + TWO_PARAMETER, T tmp = p1 * b + p2 * c; a = tmp * tmp); -template +template void BaseMatrixT::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotSquareSum(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, + TWO_PARAMETER, a *= p1 * b + p2 * c); -template +template void BaseMatrixT::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotMulSum(p1, p2), b, c); } DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0); -template +template void BaseMatrixT::copyAndClear(BaseMatrixT& b) { applyBinary(binary::CopyAndClear(), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, + TWO_PARAMETER, a = p1 * a + p2 * b * c); -template +template void BaseMatrixT::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::AddDotMul(p1, p2), b, c); } DEFINE_MATRIX_BINARY_OP(Assign, a = b;); -template +template void BaseMatrixT::assign(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Assign(), b); @@ -1223,7 +1361,7 @@ void BaseMatrixT::assign(BaseMatrixT& b) { } } -template +template void BaseMatrixT::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { if (columnOffset + b.width_ <= width_) { int numRows = height_; @@ -1243,24 +1381,31 @@ void BaseMatrixT::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { } DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp); -template +template void BaseMatrixT::deepSwap(BaseMatrixT& b) { - applyBinary(binary::DeepSwap(), b); + applyBinary(binary::DeepSwap(), b); } -template<> +template <> void BaseMatrixT::rowDotMul(size_t destCol, BaseMatrixT& b, BaseMatrixT& c) { int numRows = b.height_; int numCols = b.width_; MatrixOffset offset(destCol, 0, 0, 0, 0, 0); - aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, - numRows, numCols, offset, false_type(), + aggregate(aggregate::sum(), + base::binary::mul(), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + false_type(), true_type() /*aAsColVector*/); } -template +template void BaseMatrixT::rowDotMul2(size_t destCol, BaseMatrixT& b, BaseMatrixT& c) { @@ -1283,17 +1428,24 @@ void BaseMatrixT::rowDotMul2(size_t destCol, } } -template<> +template <> void BaseMatrixT::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; - aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, - numRows, numCols, offset, true_type() /*aAsRowVector*/, + aggregate(aggregate::sum(), + base::binary::mul(), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, false_type()); } -template +template void BaseMatrixT::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1314,16 +1466,22 @@ void BaseMatrixT::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { } DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c); -template +template void BaseMatrixT::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, - true_type() /*cAsRowVector*/, false_type()); + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + true_type() /*cAsRowVector*/, + false_type()); } -template +template void BaseMatrixT::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1343,16 +1501,22 @@ void BaseMatrixT::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::DotMul(), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::DotMul(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } -template +template void BaseMatrixT::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1372,52 +1536,82 @@ void BaseMatrixT::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, cRow); int numRows = height_; int numCols = width_; - applyTernary(ternary::DotMul(), b, c, numRows, numCols, offset, - true_type() /* cAsRowVector */, false_type() /* cAsColVector */); + applyTernary(ternary::DotMul(), + b, + c, + numRows, + numCols, + offset, + true_type() /* cAsRowVector */, + false_type() /* cAsColVector */); } -template +template void BaseMatrixT::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, cRow); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, - true_type() /* cAsRowVector */, false_type() /* cAsColVector */); + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + true_type() /* cAsRowVector */, + false_type() /* cAsColVector */); } -template +template void BaseMatrixT::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c); -template +template void BaseMatrixT::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::RowAdd(p), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::RowAdd(p), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c)); -template<> +template <> void BaseMatrixT::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { if (useGpu_) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::RowPow(), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::RowPow(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } else { size_t height = this->height_; size_t width = this->width_; @@ -1434,44 +1628,64 @@ void BaseMatrixT::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::mulRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotMul(), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::DotMul(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b); -template +template void BaseMatrixT::divRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotDiv(), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::DotDiv(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } -template +template void BaseMatrixT::mulColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotMul(), b, numRows, numCols, offset, - false_type(), true_type() /* bAsColVector */); + applyBinary(binary::DotMul(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /* bAsColVector */); } -template +template void BaseMatrixT::divColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotDiv(), b, numRows, numCols, offset, - false_type(), true_type() /* bAsColVector */); + applyBinary(binary::DotDiv(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /* bAsColVector */); } -template<> +template <> template int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1479,13 +1693,20 @@ int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(height_, numRows); CHECK_EQ(width_, 1UL); - aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, - numCols, offset, false_type(), true_type() /*aAsColVector*/); + aggregate(agg, + base::unary::identity(), + base::binary::second(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); return 0; } -template<> +template <> template int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1493,16 +1714,25 @@ int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(height_, numRows); CHECK_EQ(width_, 1UL); - aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, - false_type(), true_type() /*aAsColVector*/); + aggregate(agg, + base::unary::identity(), + sv, + b, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); return 0; } -template<> +template <> template -int BaseMatrixT::applyRow( - Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { +int BaseMatrixT::applyRow(Agg agg, + real scaleDest, + real scaleAgg, + BaseMatrixT& b) { if (scaleDest != 0) { applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b); } else { @@ -1514,10 +1744,10 @@ int BaseMatrixT::applyRow( return 0; } -template<> +template <> template -int BaseMatrixT::applyRow(Agg agg, Op op, Saver sv, - BaseMatrixT& b, BaseMatrixT& c) { +int BaseMatrixT::applyRow( + Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); size_t numRows = b.height_; size_t numCols = b.width_; @@ -1525,16 +1755,27 @@ int BaseMatrixT::applyRow(Agg agg, Op op, Saver sv, CHECK_EQ(width_, 1UL); CHECK_EQ(c.height_, numRows); CHECK_EQ(c.width_, numCols); - aggregate(agg, op, sv, - b, c, numRows, numCols, offset, - false_type(), true_type() /*aAsColVector*/); + aggregate(agg, + op, + sv, + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); return 0; } -template<> +template <> template -int BaseMatrixT::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, - BaseMatrixT& b, BaseMatrixT& c) { +int BaseMatrixT::applyRow(Agg agg, + Op op, + real scaleDest, + real scaleAgg, + BaseMatrixT& b, + BaseMatrixT& c) { if (scaleDest != 0) { applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c); } else { @@ -1546,7 +1787,7 @@ int BaseMatrixT::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, return 0; } -template<> +template <> template int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1554,13 +1795,20 @@ int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(width_, numCols); CHECK_EQ(height_, 1UL); - aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, - numCols, offset, true_type() /*aAsRowVector*/, false_type()); + aggregate(agg, + base::unary::identity(), + base::binary::second(), + b, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, + false_type()); return 0; } -template<> +template <> template int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1568,16 +1816,25 @@ int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(width_, numCols); CHECK_EQ(height_, 1UL); - aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, - true_type() /*aAsRowVector*/, false_type()); + aggregate(agg, + base::unary::identity(), + sv, + b, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, + false_type()); return 0; } -template<> +template <> template -int BaseMatrixT::applyCol( - Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { +int BaseMatrixT::applyCol(Agg agg, + real scaleDest, + real scaleAgg, + BaseMatrixT& b) { if (scaleDest != 0) { applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b); } else { @@ -1589,48 +1846,51 @@ int BaseMatrixT::applyCol( return 0; } -template<> +template <> void BaseMatrixT::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) { applyRow(aggregate::sum(), scaleDest, scaleSum, b); } -template<> +template <> void BaseMatrixT::maxRows(BaseMatrixT& b) { applyRow(aggregate::max(), b); } -template<> +template <> void BaseMatrixT::minRows(BaseMatrixT& b) { applyRow(aggregate::min(), b); } -template<> +template <> void BaseMatrixT::maxCols(BaseMatrixT& b) { applyCol(aggregate::max(), b); } -template<> +template <> void BaseMatrixT::minCols(BaseMatrixT& b) { applyCol(aggregate::min(), b); } -template<> +template <> void BaseMatrixT::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) { applyCol(aggregate::sum(), scaleDest, scaleSum, b); } -template<> -void BaseMatrixT::sumOfSquaredDiffs( - BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { - applyRow(aggregate::sum(), base::binary::squaredDiff(), - scaleDest, scaleSum, b, c); +template <> +void BaseMatrixT::sumOfSquaredDiffs(BaseMatrixT& b, + BaseMatrixT& c, + real scaleSum, + real scaleDest) { + applyRow( + aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c); } -template<> -void BaseMatrixT::sumOfProducts( - BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { - applyRow(aggregate::sum(), base::binary::mul(), - scaleDest, scaleSum, b, c); +template <> +void BaseMatrixT::sumOfProducts(BaseMatrixT& b, + BaseMatrixT& c, + real scaleSum, + real scaleDest) { + applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c); } template class BaseMatrixT; diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h index 120d69f718b954925438fbd2119d69f0be13b3e9..12ad2d45a0bbff182e78da6efb3c5ff4c6b59b55 100644 --- a/paddle/math/BaseMatrix.h +++ b/paddle/math/BaseMatrix.h @@ -488,6 +488,13 @@ public: */ void clip(T p1, T p2); + /** + * this = b < low ? 0 : 1 + * + * this = b > high ? 0 : 1 + */ + void clipDerivative(BaseMatrixT& b, T p1, T p2); + /** * @code * a = a > p ? 1.0f : 0.0f diff --git a/paddle/math/MathUtils.cpp b/paddle/math/MathUtils.cpp index 5bbc3e4e3725f186373072440a93f967178e0b27..980b6e138873046468f278c2f0b16938be82b81c 100644 --- a/paddle/math/MathUtils.cpp +++ b/paddle/math/MathUtils.cpp @@ -25,7 +25,7 @@ namespace paddle { */ void sparseRand( int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu) { - CHECK(size_t(nnz) > size_t(1)); + CHECK(size_t(nnz) >= size_t(1)); int* cpuMajor; int* cpuMinor; CpuIVector cpuMinorVec(nnz); diff --git a/paddle/math/TrainingAlgorithmOp.cu b/paddle/math/TrainingAlgorithmOp.cu index 72ff077270382d52bfcd340cc64d9abf49d1705d..fc746b85339de596d5ddc5811a8164094c13f63f 100644 --- a/paddle/math/TrainingAlgorithmOp.cu +++ b/paddle/math/TrainingAlgorithmOp.cu @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/utils/Logging.h" #include "BaseMatrix.h" #include "TrainingAlgorithmOp.h" +#include "paddle/utils/Logging.h" #if __cplusplus > 199711L @@ -32,10 +32,10 @@ void sparseMomentumApply(BaseMatrix& value, real tau, real learningRate) { auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad); - auto expr2 = momV.lazyAssign( - momV + (tau * alpha * gamma * learningRate) * grad); - auto expr3 = value.lazyAssign( - (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV); + auto expr2 = + momV.lazyAssign(momV + (tau * alpha * gamma * learningRate) * grad); + auto expr3 = value.lazyAssign((tau / beta + (real)1 / alpha) * momU + + ((real)1 / beta) * momV); AssignEvaluate(expr1, expr2, expr3); } @@ -52,12 +52,12 @@ void adadeltaApply(BaseMatrix& value, real momentum, real decayRate) { auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square()); - auto expr2 = lr.lazyAssign( - ((accum_update + epsilon) / (accum + epsilon)).sqrt()); - auto expr3 = accum_update.lazyAssign( - rou * accum_update + ((real)1 - rou) * (grad * lr).square()); - auto expr4 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr2 = + lr.lazyAssign(((accum_update + epsilon) / (accum + epsilon)).sqrt()); + auto expr3 = accum_update.lazyAssign(rou * accum_update + + ((real)1 - rou) * (grad * lr).square()); + auto expr4 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr5 = value.lazyAssign(value + mom); AssignEvaluate(expr1, expr2, expr3, expr4, expr5); @@ -74,10 +74,10 @@ void adagradApply(BaseMatrix& value, real momentum, real decayRate) { auto expr1 = accum.lazyAssign(accum + grad.square()); - auto expr2 = lr.lazyAssign( - (accum_buffer + accum + epsilon).sqrt().reciprocal()); - auto expr3 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr2 = + lr.lazyAssign((accum_buffer + accum + epsilon).sqrt().reciprocal()); + auto expr3 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr4 = value.lazyAssign(value + mom); AssignEvaluate(expr1, expr2, expr3, expr4); @@ -98,8 +98,8 @@ void rmspropApply(BaseMatrix& value, bool firstTime) { auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad); auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal()); - auto expr4 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr4 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr5 = value.lazyAssign(value + mom); if (firstTime) { @@ -107,8 +107,8 @@ void rmspropApply(BaseMatrix& value, AssignEvaluate(expr1, expr2, expr3, expr4, expr5); } else { - auto expr1 = g.lazyAssign( - accumulatedRou * g + ((real)1 - rou) * grad.square()); + auto expr1 = + g.lazyAssign(accumulatedRou * g + ((real)1 - rou) * grad.square()); AssignEvaluate(expr1, expr2, expr3, expr4, expr5); } @@ -127,8 +127,8 @@ void decayedAdagradApply(BaseMatrix& value, real decayRate, bool firstTime) { auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal()); - auto expr3 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr3 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr4 = value.lazyAssign(value + mom); if (firstTime) { @@ -136,8 +136,8 @@ void decayedAdagradApply(BaseMatrix& value, AssignEvaluate(expr1, expr2, expr3, expr4); } else { - auto expr1 = accum.lazyAssign( - accumulatedRou * accum + ((real)1 - rou) * grad.square()); + auto expr1 = accum.lazyAssign(accumulatedRou * accum + + ((real)1 - rou) * grad.square()); AssignEvaluate(expr1, expr2, expr3, expr4); } @@ -153,13 +153,12 @@ void adamApply(BaseMatrix& value, real beta2_power, real epsilon, real learningRate) { - real alpha = learningRate * - std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); + real alpha = + learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad); auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square()); - auto expr3 = value.lazyAssign( - value - (mom * alpha) / (v.sqrt() + epsilon)); + auto expr3 = value.lazyAssign(value - (mom * alpha) / (v.sqrt() + epsilon)); AssignEvaluate(expr1, expr2, expr3); } @@ -173,10 +172,10 @@ void adamaxApply(BaseMatrix& value, int64_t step, real alpha) { auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad); - auto expr2 = u.lazyAssign( - (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs())); + auto expr2 = + u.lazyAssign((beta2 * u > grad.abs()).condition(beta2 * u, grad.abs())); auto expr3 = value.lazyAssign( - value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u)); + value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u)); AssignEvaluate(expr1, expr2, expr3); } @@ -322,8 +321,8 @@ void adamApply(BaseMatrix& value, real beta2_power, real epsilon, real learningRate) { - real alpha = learningRate * - std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); + real alpha = + learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; mom = beta1 * mom + ((real)1 - beta1) * grad; @@ -331,7 +330,7 @@ void adamApply(BaseMatrix& value, // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2 v = beta2 * v + ((real)1 - beta2) * grad.square(); - value -= (mom * alpha) / (v.sqrt() + epsilon); + value -= (mom * alpha) / (v.sqrt() + epsilon); } void adamaxApply(BaseMatrix& value, diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu index 40e38434fa328bba8be6e1b8e509023d615899c1..31b693afa8bd50f77a8efb67769e6215dd755bd3 100644 --- a/paddle/math/tests/test_Tensor.cu +++ b/paddle/math/tests/test_Tensor.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include "paddle/math/Matrix.h" #include "TensorCheck.h" +#include "paddle/math/Matrix.h" using paddle::Matrix; using paddle::CpuMatrix; @@ -26,25 +26,25 @@ using paddle::GpuIVector; using autotest::TensorCheckEqual; using autotest::TensorCheckErr; -#define INIT_UNARY(A1, A2) \ - Tensor A1(height, width); \ - Tensor A2(height, width); \ - A1.randomizeUniform(); \ - A2.copyFrom(A1) -#define INIT_BINARY(A1, A2, B) \ - INIT_UNARY(A1, A2); \ - Tensor B(height, width); \ - B.randomizeUniform() -#define INIT_TERNARY(A1, A2, B, C) \ - INIT_BINARY(A1, A2, B); \ - Tensor C(height, width); \ - C.randomizeUniform() -#define INIT_QUATERNARY(A1, A2, B, C, D) \ - INIT_TERNARY(A1, A2, B, C); \ - Tensor D(height, width); \ - D.randomizeUniform() - -template +#define INIT_UNARY(A1, A2) \ + Tensor A1(height, width); \ + Tensor A2(height, width); \ + A1.randomizeUniform(); \ + A2.copyFrom(A1) +#define INIT_BINARY(A1, A2, B) \ + INIT_UNARY(A1, A2); \ + Tensor B(height, width); \ + B.randomizeUniform() +#define INIT_TERNARY(A1, A2, B, C) \ + INIT_BINARY(A1, A2, B); \ + Tensor C(height, width); \ + C.randomizeUniform() +#define INIT_QUATERNARY(A1, A2, B, C, D) \ + INIT_TERNARY(A1, A2, B, C); \ + Tensor D(height, width); \ + D.randomizeUniform() + +template struct TestUnaryMatrix { typedef std::function UnaryFunc; @@ -59,7 +59,7 @@ struct TestUnaryMatrix { } }; -template +template struct TestBinaryMatrix { typedef std::function BinaryFunc; @@ -74,10 +74,10 @@ struct TestBinaryMatrix { } }; -template +template struct TestTernaryMatrix { - typedef std::function TernaryFunc; + typedef std::function + TernaryFunc; explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) { for (auto height : {1, 11, 73, 128, 200, 330}) { @@ -90,10 +90,11 @@ struct TestTernaryMatrix { } }; -template +template struct TestQuaternaryMatrix { typedef std::function QuaternaryFunc; + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)> + QuaternaryFunc; explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) { for (auto height : {1, 11, 73, 128, 200, 330}) { @@ -106,7 +107,7 @@ struct TestQuaternaryMatrix { } }; -template +template struct TestUnaryVectorT { typedef std::function UnaryFunc; @@ -142,11 +143,11 @@ void SetTensorValue(Matrix& matrix, real value) { } } -template +template void testTensorAddScalar(Tensor& A1, Tensor& A2) { real p1 = 2.5; real p2 = 3.0; - A1.add(p1); // a += p + A1.add(p1); // a += p A2 += p1; TensorCheckEqual(A1, A2); @@ -155,7 +156,7 @@ void testTensorAddScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorSubScalar(Tensor& A1, Tensor& A2) { real p = 2.5; A1.subScalar(p); // a -= p @@ -163,7 +164,7 @@ void testTensorSubScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorMulScalar(Tensor& A1, Tensor& A2) { real p = 2.5; A1.mulScalar(p); // a *= p @@ -177,7 +178,7 @@ void testTensorMulScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorDivScalar(Tensor& A1, Tensor& A2) { real p = 2.5; A1.divScalar(p); // a /= p @@ -185,44 +186,44 @@ void testTensorDivScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorNeg(Tensor& A1, Tensor& A2) { A1.neg(); // a = -a A2 = -A2; TensorCheckEqual(A1, A2); } -template +template void testTensorAbs(Tensor& A1, Tensor& A2) { A1.abs2(); // a = a > 0 ? a : -a A2 = A2.abs(); TensorCheckEqual(A1, A2); } -template +template void testTensorSquare(Tensor& A1, Tensor& A2) { A1.square2(); // a = a * a A2 = A2.square(); TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocal(Tensor& A1, Tensor& A2) { A1.reciprocal2(); // a = 1.0f / a A2 = A2.reciprocal(); TensorCheckEqual(A1, A2); } -template +template void testTensorSign(Tensor& A1, Tensor& A2) { A1.sign2(); // a = (a > 0) - (a < 0) A2 = A2.sign(); TensorCheckEqual(A1, A2); } -template +template void testTensorAssign(Tensor& A1, Tensor& A2) { - A1.assign(1.5); // a = p + A1.assign(1.5); // a = p A2 = A2.constant(1.5); TensorCheckEqual(A1, A2); @@ -235,7 +236,7 @@ void testTensorAssign(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testUnaryBaseOp(Tensor& A1, Tensor& A2) { testTensorAddScalar(A1, A2); testTensorSubScalar(A1, A2); @@ -249,9 +250,9 @@ void testUnaryBaseOp(Tensor& A1, Tensor& A2) { testTensorAssign(A1, A2); } -template +template void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) { - A1.add(2); // a += p + A1.add(2); // a += p A2 += 2; TensorCheckEqual(A1, A2); @@ -266,46 +267,46 @@ void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) { TEST(Unary, BaseOp) { TestUnaryMatrix testCpuMatrix(testUnaryBaseOp); TestUnaryVectorT testCpuVector(testUnaryBaseOp); - TestUnaryVectorT - testCpuIVector(testUnaryBaseOpInt); + TestUnaryVectorT testCpuIVector( + testUnaryBaseOpInt); #ifndef PADDLE_ONLY_CPU TestUnaryMatrix testGpuMatrix(testUnaryBaseOp); TestUnaryVectorT testGpuVector(testUnaryBaseOp); - TestUnaryVectorT - testGpuIVector(testUnaryBaseOpInt); + TestUnaryVectorT testGpuIVector( + testUnaryBaseOpInt); #endif } -template +template void testTensorExp(Tensor& A1, Tensor& A2) { A1.exp2(); // a = exp(a) A2 = A2.exp(); TensorCheckErr(A1, A2); } -template +template void testTensorLog(Tensor& A1, Tensor& A2) { A1.log2(); // a = log(a) A2 = A2.log(); TensorCheckErr(A1, A2); } -template +template void testTensorSqrt(Tensor& A1, Tensor& A2) { A1.sqrt2(); // a = sqrt(a) A2 = A2.sqrt(); TensorCheckErr(A1, A2); } -template +template void testTensorPow(Tensor& A1, Tensor& A2) { A1.pow2(3.2); // a = pow(a, p) A2 = A2.pow(3.2); TensorCheckErr(A1, A2); } -template +template void testUnayrMathOp(Tensor& A1, Tensor& A2) { testTensorExp(A1, A2); testTensorLog(A1, A2); @@ -321,7 +322,7 @@ TEST(Unary, MathOp) { #endif } -template +template void testTensorClip(Tensor& A1, Tensor& A2) { real p1 = 0.003f; real p2 = 0.877f; @@ -331,7 +332,7 @@ void testTensorClip(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) { real p = 0.5f; A1.biggerThanScalar(p); // a = a > p ? 1.0f : 0.0f @@ -339,7 +340,7 @@ void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorapplyL1(Tensor& A1, Tensor& A2) { /** * T lambda = p; @@ -351,14 +352,15 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2) { real learningRate = 0.7f; real decayRate = 0.6f; A1.applyL1(learningRate, decayRate); - A2 = (A2 > (learningRate * decayRate)).condition( - (A2 - (learningRate * decayRate)), - (A2 < -(learningRate * decayRate)).condition( - (A2 + (learningRate * decayRate)), (real)0.0)); + A2 = (A2 > (learningRate * decayRate)) + .condition( + (A2 - (learningRate * decayRate)), + (A2 < -(learningRate * decayRate)) + .condition((A2 + (learningRate * decayRate)), (real)0.0)); TensorCheckEqual(A1, A2); } -template +template void testUnayrCompareOp(Tensor& A1, Tensor& A2) { testTensorClip(A1, A2); testTensorBiggerThanScalar(A1, A2); @@ -377,7 +379,7 @@ TEST(Unary, CompareOp) { #endif } -template +template void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) { real p1 = 2.5; real p2 = 3.2; @@ -406,7 +408,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) { real p = 2.5; A1.sub(B); // a -= b @@ -422,7 +424,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) { real p = 2.5; A1.mulScalar(B, p); // a = b * p @@ -442,7 +444,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) { real p = 2.5; A1.divScalar(B, p); // a = b / p @@ -454,28 +456,28 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) { A1.assign(B); // a = b A2 = B; TensorCheckEqual(A1, A2); } -template +template void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) { - B.square2(A1); // b = a * a + B.square2(A1); // b = a * a A2 = B.square(); TensorCheckEqual(A1, A2); } -template +template void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.squareDerivative(B); // a *= 2.0 * b A2 = A2 * (real)2.0 * B; TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) { B.reciprocal2(A1); // b = 1.0f / a A2 = B.reciprocal(); @@ -490,33 +492,33 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) { real learningRate = 0.7f; real decayRate = 1.2f; A1.applyL2(B, learningRate, decayRate); // a *= (1.0f / (1.0f + p * b)) - A2 *= (B.constant(1.0f) + - B.constant(learningRate * decayRate) * B).reciprocal(); + A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B) + .reciprocal(); TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.reciprocalDerivative(B); // a *= -b * b A2 *= (-B) * B; TensorCheckEqual(A1, A2); } -template +template void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) { B.sign2(A1); // b = a > 0.0f ? 1.0f : -1.0f A2 = B.sign(); TensorCheckEqual(A1, A2); } -template +template void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) { B.abs2(A1); // b = a > 0.0f ? a : -a A2 = B.abs(); TensorCheckEqual(A1, A2); } -template +template void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) { testTensorAdd(A1, A2, B); testTensorSub(A1, A2, B); @@ -539,7 +541,7 @@ TEST(Binary, BaseOp) { #endif } -template +template void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) { // a = exp(b) A1.exp2(B); @@ -547,14 +549,14 @@ void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.expDerivative(B); // a *= b A2 *= B; TensorCheckEqual(A1, A2); } -template +template void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) { // a = log(b) A1.log2(B); @@ -562,7 +564,7 @@ void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) { // a = sqrt(b) A1.sqrt2(B); @@ -570,7 +572,7 @@ void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) { // a = 1.0f / sqrt(b) A1.invSqrt(B); @@ -578,14 +580,14 @@ void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) { A1.pow2(B, 2.5f); // a = pow(b, p) A2 = B.pow(2.5f); TensorCheckErr(A1, A2); } -template +template void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) { /* * const T THRESHOLD = 40.0; @@ -597,12 +599,14 @@ void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) { real THRESHOLD = 40.0; A2 = (B.constant(1.0f) + - (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)).exp()).log(); + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)) + .exp()) + .log(); TensorCheckErr(A1, A2); } -template +template void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { /* * const T THRESHOLD = 40.0; @@ -612,14 +616,16 @@ void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { */ A1.softreluDerivative(B); real THRESHOLD = 40.0; - A2 = A2 * (B.constant(1.0f) - - (B.constant(-1.0f) * - (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))).exp()); + A2 = A2 * + (B.constant(1.0f) - + (B.constant(-1.0f) * + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))) + .exp()); TensorCheckErr(A1, A2); } -template +template void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) { /* const T THRESHOLD_MIN = -40.0; @@ -632,46 +638,47 @@ void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) { const real THRESHOLD_MIN = -40.0; const real THRESHOLD_MAX = 13.0; - auto tmp = (B < THRESHOLD_MIN).condition( - THRESHOLD_MIN, (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B)); + auto tmp = (B < THRESHOLD_MIN) + .condition(THRESHOLD_MIN, + (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B)); A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal(); TensorCheckErr(A1, A2); } -template +template void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.sigmoidDerivative(B); // a *= b * (1 - b) A2 *= B * (B.constant(1.0f) - B); TensorCheckEqual(A1, A2); } -template +template void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) { B.tanh(A1); // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0 A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f; TensorCheckErr(A1, A2); } -template +template void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.tanhDerivative(B); // a *= 1 - b * b A2 *= B.constant(1.0f) - B * B; TensorCheckEqual(A1, A2); } -template +template void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) { real p1 = 2.5; real p2 = 3.1; // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0) B.scaledTanh(A1, p1, p2); A2 = B.constant(p1) * - (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) - - (real)1.0); + (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) - + (real)1.0); TensorCheckErr(A1, A2); } -template +template void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { real p1 = 2.5; real p2 = 3.1; @@ -681,7 +688,7 @@ void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) { testTensorTanhDerivative(A1, A2, B); testTensorScaledTanhDerivative(A1, A2, B); @@ -708,21 +715,21 @@ TEST(Binary, MathOp) { #endif } -template +template void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) { B.relu(A1); // b = a > 0.0f ? a : 0.0f A2 = (B > (real)0.0f).condition(B, (real)0.0f); TensorCheckEqual(A1, A2); } -template +template void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.reluDerivative(B); // a *= (b > 0.0f ? 1.0f : 0.0f) A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0); TensorCheckEqual(A1, A2); } -template +template void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) { /* * b = a > p1 ? a : p1 @@ -736,7 +743,7 @@ void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { SetTensorValue(B, 32.0f); /* @@ -748,15 +755,15 @@ void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.absDerivative(B); // a = (b > 0) ? a : (b < 0) ? -a : 0 - A2 = (B > (real)0.0f).condition(A2, - (B < (real)0.0f).condition(-A2, (real)0.0f)); + A2 = (B > (real)0.0f) + .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f)); TensorCheckEqual(A1, A2); } -template +template void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) { real p = 0.613; SetTensorValue(B, p); @@ -765,7 +772,7 @@ void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) { /** * T lambda = p * b; @@ -778,12 +785,13 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) { real decayRate = 0.6f; A1.applyL1(B, learningRate, decayRate); auto lambda = B.constant(learningRate * decayRate) * B; - A2 = (A2 > lambda).condition( - (A2 - lambda), (A2 < -lambda).condition((A2 + lambda), (real)0.0f)); + A2 = (A2 > lambda) + .condition((A2 - lambda), + (A2 < -lambda).condition((A2 + lambda), (real)0.0f)); TensorCheckEqual(A1, A2); } -template +template void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) { B.subScalar(0.5f); SetTensorValue(B, 0.0f); @@ -807,7 +815,7 @@ TEST(Binary, CompareOp) { #endif } -template +template void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.add(B, C); // a = b + c A2 = B + C; @@ -833,7 +841,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.sub(B, C); // a = b - c A2 = B - C; @@ -846,7 +854,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.dotMul(B, C); // a = b * c A2 = B * C; @@ -892,7 +900,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.dotDiv(B, C); // a = (b == 0.0) ? 0.0 : b / c A2 = (B == (real)0.0).condition((real)0.0, B / C); @@ -905,7 +913,7 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { real p1 = 1.5; real p2 = 2.5; @@ -915,14 +923,14 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.softCrossEntropy(B, C); // a = -c * log(b) - (1 - c) * log(1 - b) A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log(); TensorCheckErr(A1, A2); } -template +template void testTensorSoftCrossEntropyBp(Tensor& A1, Tensor& A2, Tensor& B, @@ -932,7 +940,7 @@ void testTensorSoftCrossEntropyBp(Tensor& A1, TensorCheckEqual(A1, A2); } -template +template void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { testTensorAdd(A1, A2, B, C); testTensorSub(A1, A2, B, C); @@ -952,30 +960,30 @@ TEST(Ternary, BaseOp) { #endif } -template +template void testTensorBinaryLabelCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.binaryLabelCrossEntropy(B, C); // a = c > 0.5 ? -log(b) : -log(1.0 - b) - A2 = (C > (real)0.5).condition( - -(B.log()), -((B.constant(1.0f) - B).log())); + A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log())); TensorCheckErr(A1, A2); } -template +template void testTensorBinaryLabelCrossEntropyBp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b) A1.binaryLabelCrossEntropyBp(B, C); - A2 += (C > (real)0.5).condition( - (B.constant(-1.0f) / B), (B.constant(1.0f) - B).reciprocal()); + A2 += (C > (real)0.5) + .condition((B.constant(-1.0f) / B), + (B.constant(1.0f) - B).reciprocal()); TensorCheckErr(A1, A2); } -template +template void testTensorLogisticRegressionLoss(Tensor& A1, Tensor& A2, Tensor& B, @@ -991,13 +999,14 @@ void testTensorLogisticRegressionLoss(Tensor& A1, */ A1.logisticRegressionLoss(B, C); real THRESHOLD = 40.0; - auto tmp = (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); + auto tmp = + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp; TensorCheckErr(A1, A2); } -template +template void testTensorLogisticRegressionLossBp(Tensor& A1, Tensor& A2, Tensor& B, @@ -1013,28 +1022,29 @@ void testTensorLogisticRegressionLossBp(Tensor& A1, */ A1.logisticRegressionLossBp(B, C); real THRESHOLD = 40.0; - auto tmp = (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); + auto tmp = + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); auto tmp2 = tmp.exp(); A2 = tmp2 / (C.constant(1.0) + tmp2) - C; TensorCheckErr(A1, A2); } -template +template void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.biggerThan(B, C); // a = (b > c) ? 1.0f : 0.0f A2 = (B > C).condition((real)1.0f, (real)0.0f); TensorCheckEqual(A1, A2); } -template +template void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.max2(B, C); // a = (b > c) ? b : c A2 = (B > C).condition(B, C); TensorCheckEqual(A1, A2); } -template +template void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C); testTensorBinaryLabelCrossEntropy(A1, A2, B, C); @@ -1053,12 +1063,9 @@ TEST(Ternary, CompareOp) { #endif } -template -void testQuaternaryAdd(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testQuaternaryAdd( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f); // a = p1 * b + p2 * c + p3 * d // A2 = B * 1.5f + C * 2.5f + D * 3.5f; // TensorCheckEqual(A1, A2); @@ -1084,25 +1091,19 @@ TEST(Quaternary, BaseOp) { #endif } -template -void testTensorBiggerThan(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testTensorBiggerThan( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f); A1.biggerThan(B, C, D); - A2 = ((B > C && D > (real)0.5) - || (B < C && D < (real)0.5)).condition((real)1.0, (real)0.0); + A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5)) + .condition((real)1.0, (real)0.0); TensorCheckEqual(A1, A2); } -template -void testTensorRankLoss(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testTensorRankLoss( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { /** * const T THRESHOLD = 40.0; a = b - c; * a = (a > THRESHOLD) @@ -1114,19 +1115,17 @@ void testTensorRankLoss(Tensor& A1, real THRESHOLD = 40.0; auto tmp = B - C; - auto tmp2 = (tmp > THRESHOLD).condition( - THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); + auto tmp2 = + (tmp > THRESHOLD) + .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D; TensorCheckErr(A1, A2); } -template -void testTensorRankLossBp(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testTensorRankLossBp( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { /** * const T THRESHOLD = 40.0; a = b - c; * a = (a > THRESHOLD) @@ -1137,20 +1136,18 @@ void testTensorRankLossBp(Tensor& A1, A1.rankLossBp(B, C, D); real THRESHOLD = 40.0; auto tmp = B - C; - auto tmp2 = (tmp > THRESHOLD).condition( - THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); + auto tmp2 = + (tmp > THRESHOLD) + .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); auto tmp3 = tmp2.exp(); A2 = tmp3 / (D.constant(1.0f) + tmp3) - D; TensorCheckErr(A1, A2); } -template -void testQuaternaryCompareOp(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testQuaternaryCompareOp( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { testTensorBiggerThan(A1, A2, B, C, D); testTensorRankLoss(A1, A2, B, C, D); testTensorRankLossBp(A1, A2, B, C, D); diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu index 786d863a533b58ea9856300aaa0cd8f5a10a4dd9..92afab4ff7f5ff4acc219c5ac783733340c5726a 100644 --- a/paddle/math/tests/test_lazyAssign.cu +++ b/paddle/math/tests/test_lazyAssign.cu @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "PerfUtils.h" +#include "TensorCheck.h" #include "paddle/math/Matrix.h" #include "paddle/math/TensorAssign.h" -#include "TensorCheck.h" -#include "PerfUtils.h" using paddle::BaseMatrix; using paddle::CpuMatrix; @@ -27,14 +27,28 @@ using autotest::TensorCheckErr; typedef std::function testMatrixFunc; void testMatrixCase(testMatrixFunc matrixFunc) { for (auto height : {1}) { - for (auto width : {1, 32, 64, 128, 512, 1024, 4096, 32768, 65536, 131072, - 262144, 524288, 1048576, 2097152, 4194304, 8388608}) { + for (auto width : {1, + 32, + 64, + 128, + 512, + 1024, + 4096, + 32768, + 65536, + 131072, + 262144, + 524288, + 1048576, + 2097152, + 4194304, + 8388608}) { matrixFunc(height, width); } } } -template +template void testLazyAssign(int height, int width) { Tensor A1(height, width); Tensor A2(height, width); @@ -49,40 +63,39 @@ void testLazyAssign(int height, int width) { EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;); - EXPRESSION_PERFORMANCE( - auto expr1 = A2.lazyAssign(B + C); - auto expr2 = A2.lazyAssign(A2 * D); - AssignEvaluate(expr1, expr2);); + EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C); + auto expr2 = A2.lazyAssign(A2 * D); + AssignEvaluate(expr1, expr2);); TensorCheckErr(A1, A2); } -TEST(lazyAssign, CPU) { - testMatrixCase(testLazyAssign); -} +TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign); } #ifndef PADDLE_ONLY_CPU -TEST(lazyAssign, GPU) { - testMatrixCase(testLazyAssign); -} +TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign); } #endif -template -void sgdUpdateTensor(Tensor& A, Tensor& B, Tensor& C, Tensor& D, - real p1, real p2, real p3) { +template +void sgdUpdateTensor( + Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) { C = C * p2 - D * (B + A * p3) * p1; A += C; } -void sgdUpdateLazyAssign(BaseMatrix& A, BaseMatrix& B, - BaseMatrix& C, BaseMatrix& D, - real p1, real p2, real p3) { +void sgdUpdateLazyAssign(BaseMatrix& A, + BaseMatrix& B, + BaseMatrix& C, + BaseMatrix& D, + real p1, + real p2, + real p3) { auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1); auto expr2 = A.lazyAssign(A + C); AssignEvaluate(expr1, expr2); } -template +template void testSgdUpdate(int height, int width) { Tensor A1(height, width); Tensor A2(height, width); @@ -113,16 +126,13 @@ void testSgdUpdate(int height, int width) { * a = a + c; */ // BaseMatrix API - EXPRESSION_PERFORMANCE( - A1.sgdUpdate(B, C1, D, p1, p2, p3);); + EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3);); // Tensor expression - EXPRESSION_PERFORMANCE( - sgdUpdateTensor(A2, B, C2, D, p1, p2, p3)); + EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3)); // lazyAssign - EXPRESSION_PERFORMANCE( - sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3)); + EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3)); TensorCheckErr(A1, A2); TensorCheckErr(A1, A3); @@ -130,12 +140,8 @@ void testSgdUpdate(int height, int width) { TensorCheckErr(C1, C3); } -TEST(sgdUpdate, CPU) { - testMatrixCase(testSgdUpdate); -} +TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate); } #ifndef PADDLE_ONLY_CPU -TEST(sgdUpdate, GPU) { - testMatrixCase(testSgdUpdate); -} +TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate); } #endif diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 354f58df39365410ff9aec2576c768e58db9e0d2..d77478f345df97b37b214b5978f51ce47c1d791c 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -79,8 +79,8 @@ void testMatrixMaxSequence(int batchSize, int inputDim) { } TEST(Matrix, maxSequence) { - for (auto batchSize : {1, 10, 128, 1000, 6000}) { - for (auto inputDim : {1, 32, 100, 512}) { + for (auto batchSize : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 + for (auto inputDim : {1, 7, 131}) { // prime numbers close to 1, 8, 128 VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim; testMatrixMaxSequence(batchSize, inputDim); } @@ -240,14 +240,10 @@ TEST(Matrix, unary) { // inverse matrix testMatrixInverse(height); #else - LOG(WARNING) << "Cannot run Matrix Inverse Unit Test.\n" - << "Failed to find lapack library in current system.\n" - << "To address this issue, Please adopt one of the following " - "approaches: \n" - << "1. Simply issue `sudo apt-get install liblapacke-dev` to " - "avoid re-build source code. \n" - << "2. Install MKL/Openblas/ATLAS and re-build PaddlePaddle " - "source code."; + LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK" + << "support so we cannot test matrix inverse. To test " + << "matrix inverse, please install LAPACKE " + << "and MKL/Openblas/ATLAS, and re-build PaddlePaddle."; #endif } } @@ -341,8 +337,8 @@ void testMatrixSoftmaxBp(int height, int width) { } TEST(Matrix, softmax) { - for (auto height : {1, 11, 73, 128, 200}) { - for (auto width : {1, 32, 100, 512, 1000}) { + for (auto height : {1, 3, 131}) { // prime numbers close to 1, 4, 127 + for (auto width : {1, 17, 251}) { // prime numbers close to 1, 16, 256 VLOG(3) << " height=" << height << " width=" << width; testMatrixSoftmax(height, width); @@ -527,7 +523,7 @@ void testVectorRowFunc(int size) { } TEST(Vector, rowFunc) { - for (auto size : {1, 5, 31, 90, 150, 500, 1000, 4000}) { + for (auto size : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 VLOG(3) << " size=" << size; testVectorRowFunc(size); } @@ -604,7 +600,7 @@ void testVectorIsEqual(int size) { } TEST(Vector, Equal) { - for (auto size : {1, 5, 31, 90, 150, 500, 1000, 4000}) { + for (auto size : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 VLOG(3) << " size=" << size; testVectorReset(size); testVectorReset(size); @@ -635,9 +631,8 @@ void testMatrixTopK(int samples, int dim, int beamSize) { } TEST(Matrix, topK) { - for (auto samples : {1, 5, 31, 90, 150, 500}) { - for (auto dim : - {1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) { + for (auto samples : {1, 17, 131}) { // prime numbers close to 1, 16, 127 + for (auto dim : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) { if (beamSize > dim) continue; VLOG(3) << " samples=" << samples << " beamSize=" << beamSize @@ -650,6 +645,7 @@ TEST(Matrix, topK) { void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) { int nnz = samples * dim * ratio; + if (nnz < 1) nnz = 1; // Because sparseRand in MathUtil.cpp requires this. MatrixPtr cpuSrc = std::make_shared(samples, dim, nnz); MatrixPtr gpuSrc = std::make_shared(samples, dim, nnz); MatrixPtr cpuVal = std::make_shared(samples, beamSize); @@ -683,9 +679,9 @@ void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) { } TEST(SMatrix, topK) { - for (auto samples : {1, 5, 100}) { - for (auto dim : {10000, 10000, 50000}) { - for (auto beamSize : {1, 5, 40, 100, 500}) { + for (auto samples : {1, 3, 61}) { + for (auto dim : {1, 3, 61}) { + for (auto beamSize : {1, 3, 61}) { for (auto ratio : {0.01, 0.001}) { if (beamSize > dim) continue; VLOG(3) << " samples=" << samples << " beamSize=" << beamSize @@ -806,10 +802,9 @@ void testClassificationError(int numSamples, int dim, int topkSize) { } TEST(Matrix, classificationError) { - for (auto numSamples : {1, 5, 31, 90, 150, 300}) { - for (auto dim : - {1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) { - for (auto topkSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) { + for (auto numSamples : {1, 3, 31}) { + for (auto dim : {1, 3, 31}) { + for (auto topkSize : {1, 3, (int)rand() % dim + 1}) { if (topkSize > dim) continue; VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize << " dim= " << dim; @@ -1016,13 +1011,15 @@ void testAvgPoolFwdBwd(int numSamples, TensorCheckErr(*inputGrad, *inputGpuGrad); } +// TODO(yi): I noticed many such blindly combinatorial tests in this +// file. They are no help to locate defects at all. TEST(Matrix, PoolFwdBwd) { - for (auto numSamples : {5, 32}) { - for (auto channels : {1, 9, 32}) { - for (auto imgSizeH : {14, 28}) { - for (auto imgSizeW : {16, 30}) { - for (auto sizeX : {2, 5}) { - for (auto sizeY : {2, 5}) { + for (auto numSamples : {1, 3}) { + for (auto channels : {1, 3}) { + for (auto imgSizeH : {13, 17}) { + for (auto imgSizeW : {17, 19}) { + for (auto sizeX : {2, 3}) { + for (auto sizeY : {2, 3}) { for (auto sH : {1, 2}) { for (auto sW : {1, 2}) { for (auto pH : {0, (sizeY - 1) / 2}) { @@ -1128,8 +1125,8 @@ TEST(Matrix, MaxOutFwdBwd) { } TEST(CpuMatrix, copyFrom) { - const size_t height = 1000; - const size_t width = 1000; + const size_t height = 31; + const size_t width = 53; CpuMatrix cpu(height, width); GpuMatrix gpu(height, width); CpuMatrix copy(height, width); @@ -1141,4 +1138,69 @@ TEST(CpuMatrix, copyFrom) { TensorCheckEqual(cpu, copy); } +void testBatch2seqPadding(int batchSize, int inputDim) { + MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); + MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); + cpuInput->randomizeUniform(); + gpuInput->copyFrom(*cpuInput); + + IVectorPtr cpuSequence; + generateSequenceStartPositions(batchSize, cpuSequence); + for (int i = 0; i < int(cpuSequence->getSize()); ++i) { + (cpuSequence->getData())[i] += 1; // so no way that maxSeqLen is 0; + } + + IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); + gpuSequence->copyFrom(*cpuSequence); + + size_t numSeq = cpuSequence->getSize() - 1; + size_t maxSeqLen = *std::max_element(cpuSequence->getData(), + cpuSequence->getData() + numSeq); + + printf("numSeq = %ld, maxSeqLen = %ld\n", numSeq, maxSeqLen); + MatrixPtr cBatch = std::make_shared(numSeq * maxSeqLen, inputDim); + MatrixPtr gBatch = std::make_shared(numSeq * maxSeqLen, inputDim); + MatrixPtr cCheck = std::make_shared(numSeq * maxSeqLen, inputDim); + + // hl_sequence2batch_copy_padding(gBatch->getData(), + // gpuInput->getData(), + // cpuSequence->getData(), + // inputDim, + // maxSeqLen, + // numSeq, + // false, + // true); + // cCheck->copyFrom(*gBatch); + + // int* seqStart = cpuSequence->getData(); + // float* batchData = cBatch->getData(); + // float* seqData = cpuInput->getData(); + // for (size_t i = 0; i < maxSeqLen; i++) { + // for (size_t j = 0; j < numSeq; j++) { + // size_t sequenceStart = seqStart[j]; + // size_t sequenceLength = seqStart[j + 1] - seqStart[j]; + // if (i < sequenceLength) { + // memcpy(batchData + (i * numSeq + j) * inputDim, + // seqData + (sequenceStart + i) * inputDim, + // inputDim * sizeof(real)); + // } else { + // memset(batchData + (i * numSeq + j) * inputDim, + // 0, + // inputDim * sizeof(real)); + // } + // } + // } + + // TensorCheckErr(*cBatch, *cCheck); +} + +TEST(Matrix, warpCTC) { + for (auto batchSize : {1, 3, 17}) { + for (auto inputDim : {1, 3, 31}) { + VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim; + testBatch2seqPadding(batchSize, inputDim); + } + } +} + #endif diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 4fa3fb0ee5f826d2b084c0ba184c505aee3acc48..9c41378483993101a098fc4ad1068c1ef908e566 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -39,7 +39,7 @@ class BuddyAllocator { public: void* Alloc(size_t unaligned_size); - void Free(void*); + void Free(void* ptr); size_t Used(); public: diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h index ca0789779e273fb71c3d6282c0a921cda2d776cc..cf5815644284c23a1d2abc904f8c5053ce107a72 100644 --- a/paddle/memory/detail/meta_cache.h +++ b/paddle/memory/detail/meta_cache.h @@ -33,17 +33,17 @@ namespace detail { */ class MetadataCache { public: - MetadataCache(bool uses_gpu); + explicit MetadataCache(bool uses_gpu); public: /*! \brief Load the associated metadata for the specified memory block. */ - Metadata load(const MemoryBlock*); + Metadata load(const MemoryBlock* memory_block); /*! \brief Store the associated metadata for the specified memory block. */ - void store(MemoryBlock*, const Metadata&); + void store(MemoryBlock* memory_block, const Metadata& meta_data); /*! \brief Indicate that the specified metadata will no longer be used. */ - void invalidate(MemoryBlock*); + void invalidate(MemoryBlock* memory_block); public: MetadataCache(const MetadataCache&) = delete; diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index 44f567caf9c19775f17988b5142b7693b41a126d..72351b9dfa63513713463bb47a3684f0dfd84ad3 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -68,7 +68,7 @@ class PODDeleter { static_assert(std::is_pod::value, "T must be POD"); public: - PODDeleter(Place place) : place_(place) {} + explicit PODDeleter(Place place) : place_(place) {} void operator()(T* ptr) { Free(place_, static_cast(ptr)); } private: diff --git a/paddle/operators/.clang-format b/paddle/operators/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..47b8a85206ab457e2b3cb90a68b7a82a0753d327 --- /dev/null +++ b/paddle/operators/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 4e34ba012038dda897244b3d491a5127ee10bf45..8887dc6dbd4df911892ede67582f8048a37e9c3c 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -41,22 +41,29 @@ function(op_library TARGET) endif() endfunction() +cc_library(net_op SRCS net_op.cc DEPS op_registry) +cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) + op_library(add_op SRCS add_op.cc add_op.cu) cc_test(add_op_test SRCS add_op_test.cc DEPS add_op) +op_library(mean_op SRCS mean_op.cc mean_op.cu) +cc_test(mean_op_test SRCS mean_op_test.cc DEPS mean_op) + op_library(mul_op SRCS mul_op.cc mul_op.cu) op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc) -op_library(sigmoid_op SRCS sigmoid_op.cu sigmoid_op.cc) + +op_library(sigmoid_op SRCS sigmoid_op.cc sigmoid_op.cu) op_library(softmax_op SRCS softmax_op.cc softmax_op.cu) op_library(guassian_random_op SRCS guassain_random_op.cc guassian_random_op.cu) op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu) - -op_library(fc_op SRCS fc_op.cc DEPS mul_op rowwise_add_op sigmoid_op - softmax_op net) +op_library(fill_zeros_like_op SRCS fill_zeros_like_op.cc fill_zeros_like_op.cu) op_library(sgd_op SRCS sgd_op.cc sgd_op.cu) -op_library(recurrent_network_op SRCS recurrent_network_op.cc DEPS op_desc -tensor op_registry operator net) -cc_test(recurrent_network_op_test SRCS recurrent_network_op_test.cc DEPS -recurrent_network_op gtest mul_op add_op) +op_library(fc_op + SRCS fc_op.cc + DEPS mul_op rowwise_add_op sigmoid_op softmax_op net_op) +op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc + DEPS op_desc tensor op_registry operator net_op) +cc_test(recurrent_op_test SRCS recurrent_op_test.cc DEPS recurrent_op gtest mul_op add_op) diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc index 1424b0284372d8dfe9eb93ee251b121a48b19b0b..7fbdd84a391c7d0048fca473f7318561df50daa2 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_op.cc @@ -18,22 +18,22 @@ namespace paddle { namespace operators { class AddOp : public OperatorWithKernel { -protected: - void InferShape(const std::vector &inputs, - const std::vector &outputs) const override { - PADDLE_ENFORCE(inputs.size() == 2, "Input size of AddOp must be two"); - PADDLE_ENFORCE(outputs.size() == 1, "Output size of AddOp must be one"); - PADDLE_ENFORCE( - inputs[0] != nullptr && inputs[1] != nullptr && outputs[0] != nullptr, - "Inputs/Outputs of AddOp must all be set"); - PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(), + protected: + void InferShape(const InferShapeContext &ctx) const override { + PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of AddOp must be two"); + PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one"); + PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.InputVar(1) != nullptr, + "Inputs of AddOp must all be set"); + PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr, + "Outputs of AddOp must all be set"); + PADDLE_ENFORCE(ctx.Input(0)->dims() == ctx.Input(1)->dims(), "Two input of Add Op's dimension must be same."); - outputs[0]->Resize(inputs[0]->dims()); + ctx.Output(0)->Resize(ctx.Input(0)->dims()); } }; class AddOpMaker : public OpProtoAndCheckerMaker { -public: + public: AddOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of add op"); @@ -48,13 +48,8 @@ The equation is: Out = X + Y }; class AddOpGrad : public OperatorWithKernel { -protected: - void InferShape(const std::vector &inputs, - const std::vector &outputs) const override {} - std::string DebugString() const override { - LOG(INFO) << "AddOpGrad"; - return ""; - } + protected: + void InferShape(const InferShapeContext &ctx) const override {} }; } // namespace operators diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu index 79d8de6cd46e1c72b14b0554c7be7b4eee281f4c..9bd08634da96c5595d6dd702ad9afafb94632b03 100644 --- a/paddle/operators/add_op.cu +++ b/paddle/operators/add_op.cu @@ -1,3 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU #include "paddle/framework/op_registry.h" #include "paddle/operators/add_op.h" diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h index 0c39433788e1e07e30aaadc4766028219b05bfa5..9db19a61381fdb11350276d51d3ebbf083672022 100644 --- a/paddle/operators/add_op.h +++ b/paddle/operators/add_op.h @@ -20,17 +20,21 @@ namespace operators { template class AddKernel : public OpKernel { -public: - void Compute(const KernelContext& context) const override { - auto input0 = context.Input(0)->Get(); - auto input1 = context.Input(1)->Get(); - auto output = context.Output(0)->GetMutable(); + public: + void Compute(const ExecutionContext& context) const override { + auto input0 = context.Input(0); + auto input1 = context.Input(1); + auto output = context.Output(0); output->mutable_data(context.GetPlace()); - EigenVector::Flatten(*output).device( - *(context.GetEigenDevice())) = - EigenVector::Flatten(input0) + EigenVector::Flatten(input1); + auto X = EigenVector::Flatten(*input0); + auto Y = EigenVector::Flatten(*input1); + auto Z = EigenVector::Flatten(*output); + + auto place = context.GetEigenDevice(); + + Z.device(place) = X + Y; } }; diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 46c88d4d1a28eeedd02eb699562244651ead6d68..b0e1b8e41a5320aa14e316a56dbfd01e43c6816b 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -18,26 +18,37 @@ namespace paddle { namespace operators { class OnehotCrossEntropyOp : public OperatorWithKernel { -protected: - void InferShape(const std::vector &inputs, - const std::vector &outputs) const override { - PADDLE_ENFORCE(inputs.size() == 2, + protected: + void InferShape(const InferShapeContext &ctx) const override { + PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of OnehotCrossEntropyOp must be two"); - PADDLE_ENFORCE(outputs.size() == 1, + PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of OnehotCrossEntropyOp must be one"); - PADDLE_ENFORCE(inputs[0] != nullptr && inputs[1] != nullptr, + PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.InputVar(1) != nullptr, "Inputs of OnehotCrossEntropyOp must all be set"); - PADDLE_ENFORCE(outputs[0] != nullptr, + PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr, "Outputs of OnehotCrossEntropyOp must all be set"); - PADDLE_ENFORCE(inputs[0]->dims().size() == 2, "X's dimension must be 2."); - PADDLE_ENFORCE(outputs[0]->dims().size() == 1, + PADDLE_ENFORCE(ctx.Input(0)->dims().size() == 2, + "X's dimension must be 2."); + PADDLE_ENFORCE(ctx.Output(0)->dims().size() == 1, "label's dimension must be 1."); - outputs[0]->Resize({inputs[0]->dims()[0]}); + ctx.Output(0)->Resize({ctx.Input(0)->dims()[0]}); + } +}; + +class OnehotCrossEntropyGradientOp : public OperatorWithKernel { + protected: + void InferShape(const InferShapeContext &ctx) const override { + auto X_grad = ctx.Output(framework::GradVarName("X")); + auto X = ctx.Input("X"); + + // TODO(superjom) add enforce here after helper functions ready + X_grad->Resize(X->dims()); } }; class OnehotCrossEntropyOpMaker : public OpProtoAndCheckerMaker { -public: + public: OnehotCrossEntropyOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of OnehotCrossEntropyOp"); @@ -54,8 +65,11 @@ OnehotCrossEntropy Operator. } // namespace operators } // namespace paddle -REGISTER_OP(onehot_cross_entropy, - ops::OnehotCrossEntropyOp, +REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp, ops::OnehotCrossEntropyOpMaker); REGISTER_OP_CPU_KERNEL(onehot_cross_entropy, ops::OnehotCrossEntropyOpKernel); + +REGISTER_OP_CPU_KERNEL( + onehot_cross_entropy_grad, + ops::OnehotCrossEntropyGradientOpKernel); diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 19e4b74596a0f59edd04db830ec6f6f481373465..2f453f8379ca7ce0612fed757719acb2d2cf0ad8 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -1,4 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU #include "paddle/operators/cross_entropy_op.h" REGISTER_OP_GPU_KERNEL(onehot_cross_entropy, - ops::OnehotCrossEntropyOpKernel); \ No newline at end of file + ops::OnehotCrossEntropyOpKernel); diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index 0383df46be3a3cea7dde8f1b45857e64d5a2f2d8..88d06e13469f8e6fc9e634d804c1fe0bed5e2d75 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -18,28 +18,53 @@ limitations under the License. */ namespace paddle { namespace operators { +static const float kCrossEntropyLogThreshold{1e-20}; + template class OnehotCrossEntropyOpKernel : public OpKernel { -public: - constexpr T LOG_THRESHOLD() const { return static_cast(1e-20); } - - void Compute(const KernelContext& context) const override { - auto X = context.Input(0)->Get(); - const T* X_data = X.data(); - const int* label_data = context.Input(1)->Get().data(); - auto* Y = context.Output(0)->GetMutable(); + public: + void Compute(const ExecutionContext& ctx) const override { + auto X = ctx.Input("X"); + const T* Xdata = X->data(); + const int* label_data = ctx.Input(1)->data(); + auto Y = ctx.Output("Y"); - Y->mutable_data(context.GetPlace()); + Y->mutable_data(ctx.GetPlace()); - T* Y_data = Y->data(); + T* Ydata = Y->data(); - int batch_size = X.dims()[0]; - int class_num = X.dims()[1]; + int batch_size = X->dims()[0]; + int class_num = X->dims()[1]; // Y[i] = -log(X[i][j]) for (int i = 0; i < batch_size; ++i) { - Y_data[i] = -std::log( - std::max(X_data[i * class_num + label_data[i]], LOG_THRESHOLD())); + Ydata[i] = -std::log(std::max(Xdata[i * class_num + label_data[i]], + kCrossEntropyLogThreshold)); + } + } +}; + +template +class OnehotCrossEntropyGradientOpKernel : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const override { + auto X = ctx.Input("X"); + auto dX = ctx.Output(framework::GradVarName("X")); + auto dY = ctx.Input(framework::GradVarName("Y")); + auto label = ctx.Input("label"); + + auto* dXdata = dX->template mutable_data(ctx.GetPlace()); + auto* dYdata = dY->template data(); + auto* Xdata = X->template data(); + auto* label_data = label->data(); + + const int batch_size = X->dims()[0]; + const int class_num = X->dims()[1]; + + for (int i = 0; i < batch_size; ++i) { + dXdata[i * class_num + label_data[i]] = + -dYdata[i] / std::max(Xdata[i * class_num + label_data[i]], + kCrossEntropyLogThreshold); } } }; diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc index c4a9f5937f4fa8c60989bea1726cedbb73330156..b5cf236bac6bb5abe061f7b4ad469d20e0af76a9 100644 --- a/paddle/operators/fc_op.cc +++ b/paddle/operators/fc_op.cc @@ -18,31 +18,29 @@ namespace paddle { namespace operators { class FullyConnectedOp : public NetOp { -public: + public: void Init() override { AddOp(OpRegistry::CreateOp("mul", { Input("X"), Input("W"), }, - {Output("before_act")}, - {})); + {Output("before_act")}, {})); auto b = Input("b"); - if (b != EMPTY_VAR_NAME()) { + if (b != framework::kEmptyVarName) { AddOp(OpRegistry::CreateOp("rowwise_add", {Output("before_act"), Input("b")}, - {Output("before_act")}, - {})); + {Output("before_act")}, {})); } auto activation = GetAttr("activation"); - AddOp(OpRegistry::CreateOp( - activation, {Output("before_act")}, {Output("Y")}, {})); + AddOp(OpRegistry::CreateOp(activation, {Output("before_act")}, + {Output("Y")}, {})); CompleteAddOp(false); } }; class FullyConnectedOpMaker : public OpProtoAndCheckerMaker { -public: + public: FullyConnectedOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "the input of fc operator"); @@ -50,8 +48,8 @@ public: AddInput("b", "the bias of fc operator"); AddOutput("Y", "the output of fc operator"); - AddOutput( - "before_act", "the before activation output of fc operator", true); + AddOutput("before_act", "the before activation output of fc operator") + .SetTemporary(); AddAttr("activation", "The activation key for fc layer") .SetDefault("sigmoid") .InEnum({"sigmoid", "softmax"}); diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d37d64c5a8c288684122f3e686262399d32ed7b --- /dev/null +++ b/paddle/operators/fill_zeros_like_op.cc @@ -0,0 +1,59 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/fill_zeros_like_op.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/tensor.h" + +namespace paddle { +namespace operators { + +class FillZerosLikeOp : public framework::OperatorWithKernel { + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE(ctx.InputSize() == 1UL, + "Input size of FillZerosLikeOp must be one."); + PADDLE_ENFORCE(ctx.OutputSize() == 1UL, + "Output size of AddOp must be one."); + PADDLE_ENFORCE(ctx.InputVar(0) != nullptr, + "Input of FillZerosLikeOp must be set."); + PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr, + "Output of FillZerosLikeOp must be set."); + ctx.Output(0)->Resize( + ctx.Input(0)->dims()); + } +}; + +class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FillZerosLikeOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Src", "The input of fill-zeros-like op."); + AddOutput("Dst", "The varibale will be filled up with zeros."); + AddComment(R"DOC( +Fill up a vriable with zeros. + +The output will have the same size with input. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OP(fill_zeros_like, paddle::operators::FillZerosLikeOp, + paddle::operators::FillZerosLikeOpMaker); +REGISTER_OP_CPU_KERNEL( + fill_zeros_like, + paddle::operators::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.cu b/paddle/operators/fill_zeros_like_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ed1068219c8fee8c6e8809f450a9d38c8226f317 --- /dev/null +++ b/paddle/operators/fill_zeros_like_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/op_registry.h" +#include "paddle/operators/fill_zeros_like_op.h" + +REGISTER_OP_GPU_KERNEL( + fill_zeros_like, + paddle::operators::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4bff1fbfc15af1f4d1ce9c99fe48b0b0f11b5b3f --- /dev/null +++ b/paddle/operators/fill_zeros_like_op.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "glog/logging.h" +#include "paddle/framework/eigen.h" +#include "paddle/framework/operator.h" + +namespace paddle { +namespace operators { + +template +class FillZerosLikeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* output = context.Output(0); + output->mutable_data(context.GetPlace()); + framework::EigenVector::Flatten(*output).setZero(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8a4981c7be7587a0cc5f72cabe71e05702112ac3 --- /dev/null +++ b/paddle/operators/mean_op.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/mean_op.h" + +namespace paddle { +namespace operators { + +class MeanOp : public OperatorWithKernel { + protected: + void InferShape(const InferShapeContext &ctx) const override { + PADDLE_ENFORCE(ctx.InputSize() == 1, "Input size of AddOp must be one"); + PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one"); + PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.OutputVar(0) != nullptr, + "Input/Output of MeanOp must be initialized."); + ctx.Output(0)->Resize(framework::make_ddim({1})); + } +}; + +class MeanOpMaker : public OpProtoAndCheckerMaker { + public: + MeanOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of mean op"); + AddOutput("Out", "The output of mean op").IgnoreGradient(); + AddComment("Mean Operator"); + } +}; + +class MeanGradOp : public OperatorWithKernel { + protected: + void InferShape(const InferShapeContext &ctx) const override { + ctx.Output("X" + framework::kGradVarSuffix) + ->Resize(ctx.Input("X")->dims()); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker); +REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel); +REGISTER_GRADIENT_OP(mean, mean_grad, ops::MeanGradOp); +REGISTER_OP_CPU_KERNEL(mean_grad, ops::MeanGradKernel); diff --git a/paddle/operators/mean_op.cu b/paddle/operators/mean_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..8b97b0154ccdc8c41a90f7580af829c5c8663b60 --- /dev/null +++ b/paddle/operators/mean_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/operators/mean_op.h" + +REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel); +REGISTER_OP_GPU_KERNEL(mean_grad, ops::MeanGradKernel); diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h new file mode 100644 index 0000000000000000000000000000000000000000..40a1e2d099acad90b1bbac50f62ea7c4f691c1b4 --- /dev/null +++ b/paddle/operators/mean_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/operators/type_alias.h" + +namespace paddle { +namespace operators { + +template +class MeanKernel : public OpKernel { + public: + void Compute(const ExecutionContext& context) const override { + auto input = context.Input(0); + auto output = context.Output(0); + + output->mutable_data(context.GetPlace()); + + auto X = EigenVector::Flatten(*input); + auto y = EigenScalar::From(*output); + auto place = context.GetEigenDevice(); + + y.device(place) = X.mean(); + } +}; + +template +class MeanGradKernel : public OpKernel { + public: + void Compute(const ExecutionContext& context) const override { + auto OG = context.Input("Out" + framework::kGradVarSuffix); + PADDLE_ENFORCE(framework::product(OG->dims()) == 1, + "Mean Gradient should be scalar"); + auto IG = context.Output("X" + framework::kGradVarSuffix); + IG->mutable_data(context.GetPlace()); + + T ig_size = (T)framework::product(IG->dims()); + + EigenVector::Flatten(*IG).device(context.GetEigenDevice()) = + EigenScalar::From(*OG) / ig_size; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/mean_op_test.cc b/paddle/operators/mean_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..375dcd50e130355c60f82b9d39d1b94fb2c911b0 --- /dev/null +++ b/paddle/operators/mean_op_test.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include + +USE_OP(mean); + +TEST(MeanOp, GetOpProto) { + auto& protos = paddle::framework::OpRegistry::protos(); + auto it = protos.find("mean"); + ASSERT_NE(it, protos.end()); +} diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 22c1b78005358a934c57d487f5b0cff133f61f0c..f41e95e9db494109925fb600ec6bbd47edf6cc74 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -18,24 +18,23 @@ namespace paddle { namespace operators { class MulOp : public OperatorWithKernel { -protected: - void InferShape(const std::vector &inputs, - const std::vector &outputs) const override { - PADDLE_ENFORCE(inputs.size() == 2, "The mul op must take two inputs"); - auto dim0 = inputs[0]->dims(); - auto dim1 = inputs[1]->dims(); + protected: + void InferShape(const InferShapeContext &ctx) const override { + PADDLE_ENFORCE(ctx.InputSize() == 2, "The mul op must take two inputs"); + auto dim0 = ctx.Input(0)->dims(); + auto dim1 = ctx.Input(1)->dims(); PADDLE_ENFORCE(dim0.size() == 2 && dim1.size() == 2, "The input of mul op must be matrix"); PADDLE_ENFORCE( dim0[1] == dim1[0], "First matrix's width must be equal with second matrix's height."); - PADDLE_ENFORCE(outputs.size() == 1, "The mul op must take one output"); - outputs[0]->Resize({dim0[0], dim1[1]}); + PADDLE_ENFORCE(ctx.OutputSize() == 1, "The mul op must take one output"); + ctx.Output(0)->Resize({dim0[0], dim1[1]}); } }; class MulOpMaker : public OpProtoAndCheckerMaker { -public: + public: MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of mul op"); @@ -50,9 +49,8 @@ The equation is: Out = X * Y }; class MulOpGrad : public OperatorWithKernel { -protected: - void InferShape(const std::vector &inputs, - const std::vector &outputs) const override {} + protected: + void InferShape(const InferShapeContext &ctx) const override {} std::string DebugString() const override { LOG(INFO) << "MulGrad"; return ""; diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu index c27fc886ce7238a13c8ef86bce673a2b54949a9d..1dc04c4297daed7a7861a09cf6b99446c296ffa5 100644 --- a/paddle/operators/mul_op.cu +++ b/paddle/operators/mul_op.cu @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ +#define EIGEN_USE_GPU #include "paddle/operators/mul_op.h" -REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 467975044638a3f034ceec84173e8d3fed43cc0c..7ecd6e8ac01c9efeabe9d2873da39503966ba8df 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -21,20 +21,23 @@ namespace operators { template class MulKernel : public OpKernel { -public: - void Compute(const KernelContext& context) const override { + public: + void Compute(const ExecutionContext& context) const override { Eigen::array, 1> dim_pair = { {Eigen::IndexPair(1, 0)}}; - auto input0 = context.Input(0)->Get(); - auto input1 = context.Input(1)->Get(); - auto* output = context.Output(0)->GetMutable(); + auto input0 = context.Input("X"); + auto input1 = context.Input("Y"); + auto output = context.Output(0); output->mutable_data(context.GetPlace()); - EigenMatrix::From(*output).device(*(context.GetEigenDevice())) = - EigenMatrix::From(input0).contract(EigenMatrix::From(input1), - dim_pair); + auto X = EigenMatrix::From(*input0); + auto Y = EigenMatrix::From(*input1); + auto Z = EigenMatrix::From(*output); + auto place = context.GetEigenDevice(); + + Z.device(place) = X.contract(Y, dim_pair); } }; } // namespace operators diff --git a/paddle/framework/net.cc b/paddle/operators/net_op.cc similarity index 96% rename from paddle/framework/net.cc rename to paddle/operators/net_op.cc index 2cd378c6b21303d1a24206ba3010b0d035aaa766..fbc98e09923bda7f3baee04e02df9076247bff0b 100644 --- a/paddle/framework/net.cc +++ b/paddle/operators/net_op.cc @@ -14,11 +14,11 @@ limitations under the License. */ -#include "paddle/framework/net.h" +#include "paddle/operators/net_op.h" #include "paddle/framework/op_registry.h" namespace paddle { -namespace framework { +namespace operators { void NetOp::CompleteAddOp(bool calc) { add_op_done_ = true; @@ -74,5 +74,5 @@ std::string NetOp::DebugString() const { bool NetOp::IsNetOp() const { return true; } -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/framework/net.h b/paddle/operators/net_op.h similarity index 76% rename from paddle/framework/net.h rename to paddle/operators/net_op.h index 089c1355951f59d51db16d4b4bdce4282d6e5c25..6e7af7f02ae23ec65459dfd15d950a43e96fec4d 100644 --- a/paddle/framework/net.h +++ b/paddle/operators/net_op.h @@ -14,15 +14,17 @@ limitations under the License. */ #pragma once -#include -#include +#include "paddle/framework/op_desc.pb.h" #include "paddle/framework/op_proto.pb.h" #include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" #include "paddle/framework/scope.h" +#include "paddle/operators/type_alias.h" #include "paddle/platform/device_context.h" namespace paddle { -namespace framework { +namespace operators { + /** * @brief Network is also a type of Operator * @@ -37,13 +39,13 @@ namespace framework { * This is the base class of network, all the networks should implement the APIs * it defines. */ -class NetOp : public OperatorBase { +class NetOp : public framework::OperatorBase { public: /** * Infer all the operators' input and output variables' shapes, will be called * before every mini-batch */ - void InferShape(const std::shared_ptr& scope) const override { + void InferShape(const framework::Scope& scope) const override { for (auto& op : ops_) { op->InferShape(scope); } @@ -56,7 +58,7 @@ class NetOp : public OperatorBase { * scope will be used instead. If no OpContext is provicded, default context * will be used. */ - void Run(const std::shared_ptr& scope, + void Run(const framework::Scope& scope, const platform::DeviceContext& dev_ctx) const override { for (auto& op : ops_) { op->Run(scope, dev_ctx); @@ -68,9 +70,18 @@ class NetOp : public OperatorBase { */ void AddOp(const std::shared_ptr& op) { PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed"); + PADDLE_ENFORCE(op != nullptr, "Cannot Insert Null op"); ops_.push_back(op); } + void InsertOp(size_t pos, const std::shared_ptr& op) { + PADDLE_ENFORCE(!add_op_done_, + "Cannot InsertOp when this network is sealed"); + PADDLE_ENFORCE(op != nullptr, "Cannot Insert Null op"); + PADDLE_ENFORCE(pos <= ops_.size(), "Out of range"); + ops_.insert(ops_.begin() + pos, op); + } + void CompleteAddOp(bool calculate = true); std::string DebugString() const override; @@ -88,5 +99,5 @@ class NetOp : public OperatorBase { } }; -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/framework/net_design.md b/paddle/operators/net_op_design.md similarity index 100% rename from paddle/framework/net_design.md rename to paddle/operators/net_op_design.md diff --git a/paddle/framework/net_op_test.cc b/paddle/operators/net_op_test.cc similarity index 63% rename from paddle/framework/net_op_test.cc rename to paddle/operators/net_op_test.cc index 8048311fe54ee1827fb5b91577478a1d30803e43..c0a345464a34329d42c7bf753ca94fd07195b8e0 100644 --- a/paddle/framework/net_op_test.cc +++ b/paddle/operators/net_op_test.cc @@ -1,31 +1,34 @@ +#include "paddle/operators/net_op.h" + #include -#include -#include -#include -USE_OP(add_two); -USE_OP(mul); -USE_OP(sigmoid); -USE_OP(softmax); +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" namespace paddle { -namespace framework { +namespace operators { static int infer_shape_cnt = 0; static int run_cnt = 0; class TestOp : public OperatorBase { public: - void InferShape( - const std::shared_ptr& scope) const override { + void InferShape(const framework::Scope& scope) const override { ++infer_shape_cnt; } - void Run(const std::shared_ptr& scope, + void Run(const framework::Scope& scope, const paddle::platform::DeviceContext& dev_ctx) const override { ++run_cnt; } }; +class EmptyOp : public OperatorBase { + public: + void InferShape(const Scope& scope) const override {} + void Run(const Scope& scope, + const platform::DeviceContext& dev_ctx) const override {} +}; + template void AssertSameVectorWithoutOrder(const std::vector& expected, const std::vector& actual) { @@ -62,7 +65,7 @@ TEST(OpKernel, all) { ASSERT_EQ(1UL, tmp_idx.size()); ASSERT_EQ("y", net->outputs_[tmp_idx[0]]); - auto scope = std::make_shared(); + Scope scope; platform::CPUDeviceContext dev_ctx; net->InferShape(scope); @@ -72,20 +75,17 @@ TEST(OpKernel, all) { ASSERT_THROW(net->AddOp(op2), paddle::platform::EnforceNotMet); } -//! TODO(yuyang18): Refine Backward Op. -// TEST(AddBackwardOp, TestGradOp) { -// auto net = std::make_shared(); -// ASSERT_NE(net, nullptr); -// net->AddOp(framework::OpRegistry::CreateOp("mul", {"X", "Y"}, {"Out"}, {})); -// net->AddOp( -// framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {"Out"}, {})); -// net->AddOp(framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {""}, -// {})); -// auto grad_ops = AddBackwardOp(net); -// for (auto& op : grad_ops->ops_) { -// op->DebugString(); -// } -//} +TEST(NetOp, insert_op) { + NetOp net; + auto op1 = std::make_shared(); + op1->inputs_ = {"x", "w1", "b1"}; + op1->outputs_ = {"y"}; + net.AddOp(op1); + net.InsertOp(0, op1); + ASSERT_EQ(2UL, net.ops_.size()); + net.InsertOp(2, op1); + ASSERT_EQ(3UL, net.ops_.size()); +} -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/operators/recurrent_network_op.cc b/paddle/operators/recurrent_network_op.cc deleted file mode 100644 index 1a101d6ddf149d608dbdbe048ef43d86bacbcc16..0000000000000000000000000000000000000000 --- a/paddle/operators/recurrent_network_op.cc +++ /dev/null @@ -1,419 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/operators/recurrent_network_op.h" - -#include -#include -#include - -#include "paddle/framework/net.h" -#include "paddle/framework/op_registry.h" -#include "paddle/platform/enforce.h" - -namespace paddle { -namespace operators { - -namespace rnn { - -void SegmentInputs(std::vector>& step_scopes, - const std::vector& inlinks, - const size_t seq_len) { - PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided."); - for (size_t i = 0; i < inlinks.size(); ++i) { - Tensor* input = - step_scopes[0]->GetVariable(inlinks[i].external)->GetMutable(); - DDim dims = input->dims(); - PADDLE_ENFORCE(static_cast(dims[0]) == seq_len, - "all the inlinks must have same length"); - DDim step_dims = slice_ddim(dims, 1, dims.size()); - for (size_t j = 0; j < seq_len; j++) { - Tensor* step_input = step_scopes[j] - ->CreateVariable(inlinks[i].internal) - ->GetMutable(); - *step_input = input->Slice(j, j + 1); - step_input->Resize(step_dims); - } - } -} - -void ConcatOutputs(std::vector>& step_scopes, - const std::vector& outlinks, - const size_t seq_len) { - for (size_t i = 0; i < outlinks.size(); i++) { - Tensor* output = - step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable(); - - // TODO(qingiqng) remove following code after adding - // InferShape in RecurrentGradientOp - DDim step_dims = step_scopes[0] - ->GetVariable(outlinks[i].internal) - ->GetMutable() - ->dims(); - std::vector dims_vec = vectorize(step_dims); - dims_vec.insert(dims_vec.begin(), seq_len); - output->mutable_data(make_ddim(dims_vec), platform::CPUPlace()); - - for (size_t j = 0; j < seq_len; j++) { - Tensor* step_output = step_scopes[j] - ->GetVariable(outlinks[i].internal) - ->GetMutable(); - // TODO(luotao02) data type and platform::DeviceContext() should set - // correctly - (output->Slice(j, j + 1)) - .CopyFrom(*step_output, platform::CPUPlace()); - } - } -} - -void LinkMemories(std::vector>& scopes, - const std::vector& memories, - size_t step_id, - int offset) { - PADDLE_ENFORCE(step_id < scopes.size(), - "step [%d] is out of range of step scopes' size [%d]", - step_id, - scopes.size()); - PADDLE_ENFORCE(static_cast(step_id) + offset >= 0, - "offset [%d] must be large than -[%d]", - offset, - step_id); - PADDLE_ENFORCE(step_id + offset < scopes.size(), - "offset [%d] is out of range, it must be less than (%d - %d)", - offset, - scopes.size(), - step_id); - std::shared_ptr scope = scopes[step_id]; - std::shared_ptr linked_scope = scopes[step_id + offset]; - for (auto& attr : memories) { - auto mem = scope->CreateVariable(attr.pre_var)->GetMutable(); - // maybe share variable is better? - auto linked_mem = linked_scope->GetVariable(attr.var)->GetMutable(); - mem->ShareDataWith(*linked_mem); - - // TODO(qingqing) remove following code - // the memory of current step should be allocated in step net - auto m = scope->CreateVariable(attr.var)->GetMutable(); - // for unit test, as addOp and mulOp are null currently, if not - // mutable_data, mem.data() in output will be error. We will - // remove this line after merge the correct addOp and mulOp. - m->mutable_data(mem->dims(), platform::CPUPlace()); - } -} - -void InitArgument(const ArgumentName& name, - Argument* arg, - const OperatorBase& op) { - arg->step_net = op.Input(name.step_net); - arg->step_scopes = op.Output(name.step_scopes); - - auto inlinks = op.Inputs(name.inlinks); - auto inlink_alias = op.GetAttr>(name.inlink_alias); - PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(), - "the size of inlinks and inlink_alias don't match:%d,%d", - inlinks.size(), - inlink_alias.size()); - for (size_t i = 0; i < inlinks.size(); ++i) { - rnn::Link link; - link.external = inlinks[i]; - link.internal = inlink_alias[i]; - (arg->inlinks).push_back(link); - } - - auto outlinks = op.Outputs(name.outlinks); - auto outlink_alias = op.GetAttr>(name.outlink_alias); - PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(), - "the size of outlinks and outlink_alias don't match:%d,%d", - outlinks.size(), - outlink_alias.size()); - for (size_t i = 0; i < outlinks.size(); ++i) { - rnn::Link link; - link.external = outlinks[i]; - link.internal = outlink_alias[i]; - (arg->outlinks).push_back(link); - } - - auto boot_memories = op.Inputs(name.boot_memories); - - // attributes - auto memories = op.GetAttr>(name.memories); - auto pre_memories = op.GetAttr>(name.pre_memories); - - PADDLE_ENFORCE(memories.size() == boot_memories.size(), - "the size of memories, boot_memories don't match:%d,%d", - memories.size(), - boot_memories.size()); - PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(), - "the size of pre_memories, boot_memories don't match:%d,%d", - pre_memories.size(), - boot_memories.size()); - PADDLE_ENFORCE(memories.size() > 0, "more than 1 memories should be set"); - - for (size_t i = 0; i < memories.size(); ++i) { - rnn::MemoryAttr mem_attr; - mem_attr.var = memories[i]; - mem_attr.pre_var = pre_memories[i]; - mem_attr.boot_var = boot_memories[i]; - (arg->memories).push_back(mem_attr); - } -} - -} // namespace rnn - -void RecurrentAlgorithm::InferShape(const std::shared_ptr& scope) const { - seq_len_ = scope->GetVariable((arg_->inlinks[0]).external) - ->GetMutable() - ->dims()[0]; - CreateScopes(scope); - auto step_scopes = GetStepScopes(scope); - - // SegmentInputs is called in InferShape. The input must hold memory in - // SegmentInputs. But the other op only set dimension for the output in - // InferShape. That's a problem. Wether the RNN op needs InferShape or not? - // Wether the following functions (SegmentInputs, InitMemories, ...) need - // to rewrite for RNN op? - rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_); - - InitMemories(step_scopes[0]); - - PADDLE_ENFORCE(scope->HasVariable(arg_->step_net), - "stepnet [%s] is not in scope.", - arg_->step_net); - Variable* net = scope->GetVariable(arg_->step_net); - PADDLE_ENFORCE(net != nullptr, "failed to get step net"); - // If the InferShape is called in OperatorBase's run function, - // the rnn op only needs to do InferShape for the first time step - for (size_t i = 0; i < seq_len_; i++) { - if (i > 0) { - rnn::LinkMemories(step_scopes, arg_->memories, i, -1); - } - net->GetMutable()->InferShape(step_scopes[i]); - } - - auto outlinks = arg_->outlinks; - for (size_t i = 0; i < outlinks.size(); i++) { - DDim step_dims = step_scopes[0] - ->GetVariable(outlinks[i].internal) - ->GetMutable() - ->dims(); - std::vector dims_vec = vectorize(step_dims); - // now only support fixed length - dims_vec.insert(dims_vec.begin(), seq_len_); - Tensor* output = - step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable(); - output->Resize(make_ddim(dims_vec)); - } -} - -void RecurrentAlgorithm::Run(const std::shared_ptr& scope, - const platform::DeviceContext& dev_ctx) const { - auto step_scopes = GetStepScopes(scope); - - Variable* net = scope->GetVariable(arg_->step_net); - for (size_t step_id = 0; step_id < seq_len_; step_id++) { - // the link memory is done in InferShape - // maybe remove following code after testing - if (step_id > 0) { - rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1); - } - net->GetMutable()->Run(step_scopes[step_id], dev_ctx); - } - - rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_); -} - -void RecurrentAlgorithm::CreateScopes(std::shared_ptr scope) const { - // TODO(xxx) Only two scopes are needed for inference, this case will be - // supported later. - auto step_scopes = scope->GetVariable(arg_->step_scopes) - ->GetMutable>>(); - - if (seq_len_ > step_scopes->size()) { - for (size_t i = step_scopes->size(); i < seq_len_; ++i) { - std::shared_ptr step_scope = std::make_shared(scope); - - // Now all variables in scope must be created outside of op. - auto net_op = scope->GetVariable(arg_->step_net)->GetMutable(); - for (auto& input : net_op->inputs_) { - step_scope->CreateVariable(input); - } - for (auto& output : net_op->outputs_) { - step_scope->CreateVariable(output); - } - - step_scopes->push_back(std::make_shared(step_scope)); - } - } -} - -void RecurrentAlgorithm::InitMemories(std::shared_ptr step_scope) const { - for (auto& attr : arg_->memories) { - Tensor* pre_mem = - step_scope->CreateVariable(attr.pre_var)->GetMutable(); - PADDLE_ENFORCE(step_scope->HasVariable(attr.boot_var), - "memory [%s]'s boot variable [%s] not exists", - attr.var, - attr.boot_var); - Tensor* boot_mem = - step_scope->GetVariable(attr.boot_var)->GetMutable(); - pre_mem->ShareDataWith(*boot_mem); - - // TODO(qingqing) remove following code - // the memory of current step should be allocated in step net - // here for unit test - auto cur_step_mem = - step_scope->CreateVariable(attr.var)->GetMutable(); - cur_step_mem->mutable_data(boot_mem->dims(), platform::CPUPlace()); - } -} - -const rnn::ArgumentName RecurrentOp::kArgName{"step_net", - "step_scopes", - "inlinks", - "outlinks", - "inlink_alias", - "outlink_alias", - "memories", - "pre_memories", - "boot_memories"}; - -const rnn::ArgumentName RecurrentGradientOp::kArgName{"step_net", - "step_scopes", - "outlink@grad", - "inlink@grad", - "inlink_alias", - "outlink_alias", - "memories", - "pre_memories", - "boot_memories@grad"}; - -void RecurrentOp::Init() { - OperatorBase::Init(); - std::unique_ptr arg(new rnn::Argument()); - rnn::InitArgument(kArgName, arg.get(), *this); - alg_.Init(std::move(arg)); -} - -class RecurrentAlgorithmProtoAndCheckerMaker : public OpProtoAndCheckerMaker { -public: - RecurrentAlgorithmProtoAndCheckerMaker(OpProto* proto, - OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - const auto& name = RecurrentOp::kArgName; - // inputs and outputs stored in proto - AddInputs(name.inlinks, - "the input that need to be segmented for each step."); - AddInputs(name.boot_memories, "variables to initialize memories."); - AddInput(name.step_net, "network shared by all steps."); - - AddOutputs(name.outlinks, - "the output that need to concated for all steps."); - AddOutput(name.step_scopes, "step scopes"); - - // Attributes stored in AttributeMap - AddAttr>(name.inlink_alias, "alias of inlinks"); - AddAttr>(name.outlink_alias, "alias of outlinks"); - AddAttr>(name.pre_memories, - "names of pre-memories"); - AddAttr>(name.memories, "names of memories"); - - AddComment("This is a recurrent group operator."); - } -}; - -void RecurrentGradientAlgorithm::Run( - const std::shared_ptr& scope, - const platform::DeviceContext& dev_ctx) const { - auto step_scopes = GetStepScopes(scope); - rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_); - PADDLE_ENFORCE(scope->HasVariable(arg_->step_net), - "step net is not in scope."); - Variable* net = scope->GetVariable(arg_->step_net); - PADDLE_ENFORCE(net != nullptr, "failed to get step net"); - for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { - if (static_cast(step_id) != seq_len_ - 1) { - rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1); - } - net->GetMutable()->Run(step_scopes[step_id], dev_ctx); - } - LinkBootMemoryGradients(step_scopes[0]); - rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_); -} - -void RecurrentGradientAlgorithm::LinkBootMemoryGradients( - std::shared_ptr step_scope) const { - for (auto& attr : arg_->memories) { - Tensor* mem_grad = - step_scope->CreateVariable(attr.var)->GetMutable(); - PADDLE_ENFORCE(mem_grad != nullptr, - "boot_tensor should be retrieved before"); - PADDLE_ENFORCE(step_scope->HasVariable(attr.boot_var), - "memory [%s]'s boot variable [%s] not exists", - attr.var, - attr.boot_var); - Tensor* boot_mem_grad = - step_scope->CreateVariable(attr.boot_var)->GetMutable(); - boot_mem_grad->ShareDataWith(*mem_grad); - } -} - -void RecurrentGradientAlgorithm::InferShape( - const std::shared_ptr& scope) const { - seq_len_ = scope->GetVariable((arg_->inlinks[0]).external) - ->GetMutable() - ->dims()[0]; - auto step_scopes = GetStepScopes(scope); - rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_); - - PADDLE_ENFORCE(scope->HasVariable(arg_->step_net), - "step net is not in scope."); - Variable* net = scope->GetVariable(arg_->step_net); - PADDLE_ENFORCE(net != nullptr, "failed to get step net"); - - for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { - if (static_cast(step_id) != seq_len_ - 1) { - rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1); - } - net->GetMutable()->InferShape(step_scopes[step_id]); - } - - auto outlinks = arg_->outlinks; - for (size_t i = 0; i < outlinks.size(); i++) { - DDim step_dims = step_scopes[0] - ->GetVariable(outlinks[i].internal) - ->GetMutable() - ->dims(); - std::vector dims_vec = vectorize(step_dims); - // now only support fixed length - dims_vec.insert(dims_vec.begin(), seq_len_); - Tensor* output = - step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable(); - output->Resize(make_ddim(dims_vec)); - } - LinkBootMemoryGradients(step_scopes[0]); -} - -void RecurrentGradientOp::Init() { - OperatorBase::Init(); - std::unique_ptr arg(new rnn::Argument()); - rnn::InitArgument(kArgName, arg.get(), *this); - alg_.Init(std::move(arg)); -} - -} // namespace operators -} // namespace paddle - -REGISTER_OP(recurrent_op, - paddle::operators::RecurrentOp, - paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker); diff --git a/paddle/operators/recurrent_network_op.h b/paddle/operators/recurrent_network_op.h deleted file mode 100644 index 8946c8ce38117c391edcf56558c640ebd0d7f75c..0000000000000000000000000000000000000000 --- a/paddle/operators/recurrent_network_op.h +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#include "paddle/framework/operator.h" - -namespace paddle { -namespace operators { - -using namespace paddle::framework; - -namespace rnn { - -/** - * Memory of a RNN (same as the role of `Momory` in PaddlePaddle). - * - * Memory attributes cached by this op, dims will be infered from - * boot memories in father scope. Other attributes are copied from Op's proto - * attributes. - */ -struct MemoryAttr { - // name of current state variable - std::string var; - // name of previous step's state variable - std::string pre_var; - // name of the variables to init this memory (same role of `boot_layer` in - // PaddlePaddle), which is store in father's scope. - std::string boot_var; -}; - -struct Link { - // input or output links name. - std::string internal; - // alias to avoid duplicate keys in scopes. - std::string external; -}; - -struct Argument { - std::string step_net; - std::string step_scopes; - std::vector inlinks; - std::vector outlinks; - std::vector memories; -}; - -struct ArgumentName { - std::string step_net; - std::string step_scopes; - std::string inlinks; - std::string outlinks; - std::string inlink_alias; // the alias of inlinks in step net. - std::string outlink_alias; // the alias of outlinks in step net. - std::string memories; // the memory name - std::string pre_memories; // the previous memory name - std::string boot_memories; // the boot memory name -}; - -/** - * Prepare inputs for each step net. - */ -void SegmentInputs(std::vector>& step_scopes, - const std::vector& inlinks, - const size_t seq_len); - -/** - * Process outputs of step nets and merge to variables. - */ -void ConcatOutputs(std::vector>& step_scopes, - const std::vector& outlinks, - const size_t seq_len); - -void LinkMemories(std::vector>& step_scopes, - const std::vector& memories, - size_t step_id, - int offset); - -void InitArgument(const ArgumentName& name, Argument* arg); - -}; // namespace rnn - -// The sequence format in RecurrentOp is Tensor now. -// TODO: -// 1. No-padding computing for sequences with indifinite length in one batch. -// 2. Hierarchical RNN for sequence with sub-sequence. -// 3. Internal Memory. -// 4. More Complex RNN architecture, such as Gated Feedback RNN. -// Refer to: https://arxiv.org/pdf/1502.02367.pdf - -class RecurrentAlgorithm { -public: - void Run(const std::shared_ptr& scope, - const platform::DeviceContext& dev_ctx) const; - - void Init(std::unique_ptr arg) { arg_ = std::move(arg); } - - /** - * InferShape must be called before Run. - */ - void InferShape(const std::shared_ptr& scope) const; - -protected: - /* - * The step scopes will be stored in the father scope as a variable. - * - * NOTE the scopes are reused in both the forward and backward, so just - * create once and expand its size if more steps need. - */ - void CreateScopes(std::shared_ptr scope) const; - - inline const std::vector>& GetStepScopes( - std::shared_ptr scope) const { - return *(scope->GetVariable(arg_->step_scopes)) - ->GetMutable>>(); - } - - void InitMemories(std::shared_ptr step_scopes) const; - -private: - std::unique_ptr arg_; - mutable size_t seq_len_; -}; - -class RecurrentGradientAlgorithm { - /** - * RNN's backward alogorithm. - * - * To accelerate the development of RecurrentGradientOp, we decouple RNN's - * algorithm and `OperatorBase`'s implementation, the former contains the core - * implementation of a RNN, and will keep stable even if the framework changes - * a - * lot, and the latter is a wrapper acts like an dapter for it to make RNN an - * operator. - */ -public: - void Init(std::unique_ptr arg) { arg_ = std::move(arg); } - - void Run(const std::shared_ptr& scope, - const platform::DeviceContext& dev_ctx) const; - - void LinkBootMemoryGradients(std::shared_ptr step_scopes) const; - - /** - * InferShape must be called before Run. - */ - void InferShape(const std::shared_ptr& scope) const; - -protected: - inline const std::vector>& GetStepScopes( - std::shared_ptr scope) const { - return *(scope->GetVariable(arg_->step_scopes)) - ->GetMutable>>(); - } - -private: - std::unique_ptr arg_; - mutable size_t seq_len_; -}; - -class RecurrentOp final : public OperatorBase { -public: - void Init() override; - - /** - * InferShape must be called before Run. - */ - virtual void InferShape(const std::shared_ptr& scope) const override { - alg_.InferShape(scope); - } - - virtual void Run(const std::shared_ptr& scope, - const platform::DeviceContext& dev_ctx) const override { - alg_.Run(scope, dev_ctx); - } - - static const rnn::ArgumentName kArgName; - -private: - RecurrentAlgorithm alg_; -}; - -class RecurrentGradientOp final : public OperatorBase { -public: - void Init() override; - - /** - * InferShape must be called before Run. - */ - virtual void InferShape(const std::shared_ptr& scope) const override { - alg_.InferShape(scope); - } - - virtual void Run(const std::shared_ptr& scope, - const platform::DeviceContext& dev_ctx) const override { - alg_.Run(scope, dev_ctx); - } - - static const rnn::ArgumentName kArgName; - -private: - RecurrentGradientAlgorithm alg_; -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..389d4323950269b81912a7213ff64872aafb410f --- /dev/null +++ b/paddle/operators/recurrent_op.cc @@ -0,0 +1,224 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/recurrent_op.h" + +#include +#include +#include + +#include "paddle/framework/op_registry.h" +#include "paddle/operators/net_op.h" +#include "paddle/platform/enforce.h" + +namespace paddle { +namespace operators { + +void RecurrentAlgorithm::InferShape(const Scope& scope) const { + seq_len_ = scope.FindVar((arg_->inlinks[0]).external) + ->GetMutable() + ->dims()[0]; + CreateScopes(scope); + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_, + true /*infer_shape_mode*/); + InitMemories(step_scopes[0], true /*infer_shape_mode*/); + Variable* net = scope.FindVar(arg_->step_net); + PADDLE_ENFORCE(net != nullptr, "failed to get step net"); + for (size_t i = 0; i < seq_len_; i++) { + if (i > 0) { + rnn::LinkMemories(step_scopes, arg_->memories, i, -1, + true /*infer_shape_mode*/); + } + net->GetMutable()->InferShape(*step_scopes[i]); + } + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_, + true /*infer_shape_mode*/); +} + +void RecurrentAlgorithm::Run(const Scope& scope, + const platform::DeviceContext& dev_ctx) const { + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_, + false /*infer_shape_mode*/); + InitMemories(step_scopes[0], false /*infer_shape_mode*/); + Variable* net = scope.FindVar(arg_->step_net); + + for (size_t step_id = 0; step_id < seq_len_; step_id++) { + if (step_id > 0) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1, + false /*infer_shape_mode*/); + } + net->GetMutable()->Run(*step_scopes[step_id], dev_ctx); + } + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_, + false /*infer_shape_mode*/); +} + +void RecurrentAlgorithm::CreateScopes(const Scope& scope) const { + // TODO(xxx) Only two scopes are needed for inference, this case will be + // supported later. + auto step_scopes = + scope.FindVar(arg_->step_scopes)->GetMutable>(); + + if (seq_len_ > step_scopes->size()) { + for (size_t i = step_scopes->size(); i < seq_len_; ++i) { + auto& step_scope = scope.NewScope(); + + // Now all variables in scope must be created outside of op. + auto net_op = scope.FindVar(arg_->step_net)->GetMutable(); + for (auto& input : net_op->inputs_) { + // the weight are located in parent scope + if (!step_scope.FindVar(input)) step_scope.NewVar(input); + } + for (auto& output : net_op->outputs_) { + step_scope.NewVar(output); + } + step_scopes->emplace_back(&step_scope); + } + } +} + +void RecurrentAlgorithm::InitMemories(Scope* step_scope, + bool infer_shape_mode) const { + for (auto& attr : arg_->memories) { + Tensor* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable(); + PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr, + "memory [%s]'s boot variable [%s] not exists", attr.var, + attr.boot_var); + Tensor* boot_mem = step_scope->FindVar(attr.boot_var)->GetMutable(); + if (infer_shape_mode) { + pre_mem->Resize(boot_mem->dims()); + } else { + pre_mem->ShareDataWith(*boot_mem); + } + } +} + +const rnn::ArgumentName RecurrentOp::kArgName{ + "step_net", "step_scopes", "inlinks", + "outlinks", "inlink_alias", "outlink_alias", + "memories", "pre_memories", "boot_memories"}; + +const rnn::ArgumentName RecurrentGradientOp::kArgName{ + "step_net", "step_scopes", "outlink@grad", + "inlink@grad", "inlink_alias", "outlink_alias", + "memories", "pre_memories", "boot_memories@grad"}; + +void RecurrentOp::Init() { + OperatorBase::Init(); + std::unique_ptr arg(new rnn::Argument()); + rnn::InitArgument(kArgName, arg.get(), *this); + alg_.Init(std::move(arg)); +} + +class RecurrentAlgorithmProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + RecurrentAlgorithmProtoAndCheckerMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + const auto& name = RecurrentOp::kArgName; + // inputs and outputs stored in proto + AddInput(name.inlinks, + "the inputs that need to be segmented for each step.") + .SetMultiple(); + AddInput(name.boot_memories, "variables to initialize memories.") + .SetMultiple(); + AddInput(name.step_net, "network shared by all steps."); + + AddOutput(name.outlinks, "the outputs that need to concated for all steps.") + .SetMultiple(); + AddOutput(name.step_scopes, "step scopes"); + + // Attributes stored in AttributeMap + AddAttr>(name.inlink_alias, "alias of inlinks"); + AddAttr>(name.outlink_alias, "alias of outlinks"); + AddAttr>(name.pre_memories, + "names of pre-memories"); + AddAttr>(name.memories, "names of memories"); + + AddComment("This is a recurrent group operator."); + } +}; + +void RecurrentGradientAlgorithm::Run( + const Scope& scope, const platform::DeviceContext& dev_ctx) const { + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_, + false /*infer_shape_mode*/); + Variable* net = scope.FindVar(arg_->step_net); + PADDLE_ENFORCE(net != nullptr, "failed to get step net"); + for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { + if (static_cast(step_id) != seq_len_ - 1) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1, + false /*infer_shape_mode*/); + } + net->GetMutable()->Run(*step_scopes[step_id], dev_ctx); + } + LinkBootMemoryGradients(step_scopes[0], false); + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_, + false /*infer_shape_mode*/); +} + +void RecurrentGradientAlgorithm::LinkBootMemoryGradients( + Scope* step_scope, bool infer_shape_mode) const { + for (auto& attr : arg_->memories) { + PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr, + "memory variable [%s] does not exists", attr.var); + PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr, + "boot variable [%s] does not exists", attr.boot_var); + Tensor* mem_grad = step_scope->NewVar(attr.var)->GetMutable(); + Tensor* boot_mem_grad = + step_scope->NewVar(attr.boot_var)->GetMutable(); + if (infer_shape_mode) { + boot_mem_grad->Resize(mem_grad->dims()); + } else { + boot_mem_grad->ShareDataWith(*mem_grad); + } + } +} + +void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const { + seq_len_ = scope.FindVar((arg_->inlinks[0]).external) + ->GetMutable() + ->dims()[0]; + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_, + true /*infer_shape_mode*/); + Variable* net = scope.FindVar(arg_->step_net); + PADDLE_ENFORCE(net != nullptr, "failed to get step net"); + for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { + if (static_cast(step_id) != seq_len_ - 1) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1, + true /*infer_shape_mode*/); + } + net->GetMutable()->InferShape(*step_scopes[step_id]); + } + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_, + true /*infer_shape_mode*/); + LinkBootMemoryGradients(step_scopes[0], true /*infer_shape_mode*/); +} + +void RecurrentGradientOp::Init() { + OperatorBase::Init(); + std::unique_ptr arg(new rnn::Argument()); + rnn::InitArgument(kArgName, arg.get(), *this); + alg_.Init(std::move(arg)); +} + +} // namespace operators +} // namespace paddle + +REGISTER_OP(recurrent_op, paddle::operators::RecurrentOp, + paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker); diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d1e60fed9cef3c6dccba3ad498fc3658a177b3f7 --- /dev/null +++ b/paddle/operators/recurrent_op.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/framework/operator.h" +#include "paddle/operators/rnn/recurrent_op_utils.h" + +namespace paddle { +namespace operators { + +// The sequence format in RecurrentOp is Tensor now. +// TODO(Yan Chunwei): +// 1. No-padding computing for sequences with indifinite length in one batch. +// 2. Hierarchical RNN for sequence with sub-sequence. +// 3. Internal Memory. +// 4. More Complex RNN architecture, such as Gated Feedback RNN. +// Refer to: https://arxiv.org/pdf/1502.02367.pdf + +class RecurrentAlgorithm { + public: + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const; + + void Init(std::unique_ptr arg) { arg_ = std::move(arg); } + + /** + * InferShape must be called before Run. + */ + void InferShape(const framework::Scope& scope) const; + + protected: + /* + * The step scopes will be stored in the father scope as a variable. + * + * NOTE the scopes are reused in both the forward and backward, so just + * create once and expand its size if more steps need. + */ + void CreateScopes(const framework::Scope& scope) const; + + const std::vector& GetStepScopes( + const framework::Scope& scope) const { + return *scope.FindVar(arg_->step_scopes) + ->GetMutable>(); + } + + void InitMemories(framework::Scope* step_scopes, bool infer_shape_mode) const; + + private: + std::unique_ptr arg_; + mutable size_t seq_len_; +}; + +class RecurrentGradientAlgorithm { + /** + * RNN's backward alogorithm. + * + * To accelerate the development of RecurrentGradientOp, we decouple RNN's + * algorithm and `OperatorBase`'s implementation, the former contains the core + * implementation of a RNN, and will keep stable even if the framework changes + * a + * lot, and the latter is a wrapper acts like an dapter for it to make RNN an + * operator. + */ + public: + void Init(std::unique_ptr arg) { arg_ = std::move(arg); } + + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const; + + void LinkBootMemoryGradients(framework::Scope* step_scopes, + bool infer_shape_mode) const; + + /** + * InferShape must be called before Run. + */ + void InferShape(const framework::Scope& scope) const; + + protected: + inline const std::vector& GetStepScopes( + const framework::Scope& scope) const { + return *scope.FindVar(arg_->step_scopes) + ->GetMutable>(); + } + + private: + std::unique_ptr arg_; + mutable size_t seq_len_; +}; + +class RecurrentOp final : public framework::OperatorBase { + public: + void Init() override; + + /** + * InferShape must be called before Run. + */ + void InferShape(const framework::Scope& scope) const override { + alg_.InferShape(scope); + } + + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const override { + alg_.Run(scope, dev_ctx); + } + + static const rnn::ArgumentName kArgName; + + private: + RecurrentAlgorithm alg_; +}; + +class RecurrentGradientOp final : public framework::OperatorBase { + public: + void Init() override; + + /** + * InferShape must be called before Run. + */ + void InferShape(const framework::Scope& scope) const override { + alg_.InferShape(scope); + } + + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const override { + alg_.Run(scope, dev_ctx); + } + + static const rnn::ArgumentName kArgName; + + private: + RecurrentGradientAlgorithm alg_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/recurrent_network_op_test.cc b/paddle/operators/recurrent_op_test.cc similarity index 67% rename from paddle/operators/recurrent_network_op_test.cc rename to paddle/operators/recurrent_op_test.cc index 6784ac6001ad1b464d65814cff1ad6247826ad66..3607d14bf875dc2892fbbdc4dbc9ccf87c1b9784 100644 --- a/paddle/operators/recurrent_network_op_test.cc +++ b/paddle/operators/recurrent_op_test.cc @@ -11,20 +11,25 @@ limitations under the License. */ +#include "paddle/operators/recurrent_op.h" + #include #include -#include "paddle/framework/net.h" +#include "paddle/framework/ddim.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" #include "paddle/framework/tensor.h" -#include "paddle/operators/recurrent_network_op.h" +#include "paddle/operators/net_op.h" namespace paddle { namespace operators { +using framework::make_ddim; +using framework::DDim; + class RecurrentOpTest : public ::testing::Test { -protected: + protected: virtual void SetUp() override { CreateGlobalVariables(); CreateStepNet(); @@ -34,45 +39,44 @@ protected: virtual void TearDown() override {} void CreateGlobalVariables() { - scope_ = std::make_shared(); // create input, and init content LOG(INFO) << "create global variable x"; for (auto inlink : std::vector{"x", "x0", "x1", "h"}) { - Variable* x = scope_->CreateVariable(inlink); + Variable* x = scope_.NewVar(inlink); DDim dims = make_ddim(std::vector{ 10 /*sent size*/, 20 /*batch size*/, 30 /*input dim*/}); x->GetMutable()->mutable_data(dims, platform::CPUPlace()); } // create output alias just for test for (auto inlink : std::vector{"h@alias"}) { - Variable* x = scope_->CreateVariable(inlink); + Variable* x = scope_.NewVar(inlink); DDim dims = make_ddim(std::vector{20 /*batch size*/, 30 /*input dim*/}); x->GetMutable()->mutable_data(dims, platform::CPUPlace()); } LOG(INFO) << "create global variable w"; - Variable* w = scope_->CreateVariable("rnn/w"); + Variable* w = scope_.NewVar("rnn/w"); w->GetMutable()->mutable_data( make_ddim(std::vector{30, 30}), platform::CPUPlace()); - for (auto boot : std::vector{"x_boot", "h_boot"}) { + for (auto boot : std::vector{"h_boot"}) { LOG(INFO) << "create global variable " << boot; - Variable* h_boot = scope_->CreateVariable(boot); + Variable* h_boot = scope_.NewVar(boot); h_boot->GetMutable()->mutable_data( make_ddim(std::vector{20 /*batch size*/, 30 /*input dim*/}), platform::CPUPlace()); } LOG(INFO) << "create variable step_scopes"; - scope_->CreateVariable("step_scopes"); + scope_.NewVar("step_scopes"); LOG(INFO) << "create variable h"; - scope_->CreateVariable("h"); + scope_.NewVar("h"); } void CreateRNNOp() { - OpDesc op_desc; + framework::OpDesc op_desc; op_desc.set_type("recurrent_op"); // inlinks 0 @@ -80,7 +84,6 @@ protected: op_desc.add_inputs("x0"); op_desc.add_inputs("x1"); // boot_memories 3 - op_desc.add_inputs("x_boot"); op_desc.add_inputs("h_boot"); // step net 5 op_desc.add_inputs("step_net"); @@ -92,7 +95,7 @@ protected: auto _input_format = std::vector{ 0, // in_link 3, // memories - 5 // step_net + 4 // step_net }; auto input_format = op_desc.add_attrs(); input_format->set_name("input_format"); @@ -130,12 +133,11 @@ protected: inlink_alias->add_strings(item); } // pre memories - for (const auto& item : - std::vector{"rnn/x@pre", "rnn/h@pre"}) { + for (const auto& item : std::vector{"rnn/h@pre"}) { pre_memories->add_strings(item); } // memories - for (const auto& item : std::vector{"rnn/x", "rnn/h"}) { + for (const auto& item : std::vector{"rnn/h"}) { memories->add_strings(item); } // output alias @@ -150,21 +152,18 @@ protected: void CreateStepNet() { LOG(INFO) << "create variable step_net"; - Variable* var = scope_->CreateVariable("step_net"); + Variable* var = scope_.NewVar("step_net"); auto net = var->GetMutable(); - // rnn/s is net's input or output? - net->inputs_ = {"rnn/h@pre", "rnn/w", "rnn/x"}; - net->inputs_ = {"rnn/s", "rnn/h"}; net->AddOp( OpRegistry::CreateOp("mul", {"rnn/h@pre", "rnn/w"}, {"rnn/s"}, {})); net->AddOp( - OpRegistry::CreateOp("add_two", {"rnn/x", "rnn/s"}, {"rnn/h"}, {})); + OpRegistry::CreateOp("add_two", {"x@alias", "rnn/s"}, {"rnn/h"}, {})); net->CompleteAddOp(); } // father scope - std::shared_ptr scope_; + Scope scope_; std::shared_ptr rnn_op_; }; @@ -175,7 +174,7 @@ TEST_F(RecurrentOpTest, Run) { } class RecurrentGradientAlgorithmTest : public ::testing::Test { -protected: + protected: virtual void SetUp() override { CreateGlobalVariables(); CreateStepScopes(); @@ -191,68 +190,64 @@ protected: virtual void TearDown() override {} void CreateGlobalVariables() { - scope_ = std::make_shared(); // inputs: x LOG(INFO) << "create global variable x"; - Variable* x = scope_->CreateVariable("x"); + Variable* x = scope_.NewVar("x"); DDim dims = make_ddim({10 /*sent size*/, 20 /*batch size*/, 30 /*input dim*/}); x->GetMutable()->mutable_data(dims, platform::CPUPlace()); // inputs: h_boot LOG(INFO) << "create global variable h_boot"; - Variable* h_boot = scope_->CreateVariable("h_boot"); + Variable* h_boot = scope_.NewVar("h_boot"); h_boot->GetMutable()->mutable_data( make_ddim({20 /*batch size*/, 30 /*input dim*/}), platform::CPUPlace()); // inputs: w LOG(INFO) << "create global variable w"; - Variable* w = scope_->CreateVariable("rnn/w"); + Variable* w = scope_.NewVar("rnn/w"); w->GetMutable()->mutable_data(make_ddim({30, 30}), platform::CPUPlace()); // inputs: h_grad LOG(INFO) << "create variable h_grad"; - Variable* dh = scope_->CreateVariable("h_grad"); + Variable* dh = scope_.NewVar("h_grad"); dh->GetMutable()->mutable_data(make_ddim({10, 20, 30}), platform::CPUPlace()); // inputs: step_scopes LOG(INFO) << "create variable step_scopes"; - scope_->CreateVariable("step_scopes"); + scope_.NewVar("step_scopes"); // inputs: step_net LOG(INFO) << "create variable step_net"; - scope_->CreateVariable("step_net"); + scope_.NewVar("step_net"); // outputs: w_grad LOG(INFO) << "create global variable w_grad"; - scope_->CreateVariable("rnn/w_grad"); + scope_.NewVar("rnn/w_grad"); // outputs: x_grad LOG(INFO) << "create global variable x_grad"; - scope_->CreateVariable("x_grad"); + scope_.NewVar("x_grad"); // outputs: h_boot_grad LOG(INFO) << "create global variable h_boot_grad"; - scope_->CreateVariable("h_boot_grad"); + scope_.NewVar("h_boot_grad"); } void CreateStepScopes() { - std::vector>* step_scopes = - scope_->GetVariable("step_scopes") - ->GetMutable>>(); + auto step_scopes = + scope_.FindVar("step_scopes")->GetMutable>(); for (int i = 0; i < 10; ++i) { - auto scope = std::make_shared(scope_); - auto pre_t = scope->CreateVariable("rnn/pre_h")->GetMutable(); - pre_t->mutable_data(make_ddim({20, 30}), platform::CPUPlace()); - auto tensor = scope->CreateVariable("rnn/h")->GetMutable(); - tensor->mutable_data(make_ddim({20, 30}), platform::CPUPlace()); + auto& scope = scope_.NewScope(); + auto pre_t = scope.NewVar("rnn/pre_h")->GetMutable(); + pre_t->mutable_data({20, 30}, platform::CPUPlace()); + auto tensor = scope.NewVar("rnn/h")->GetMutable(); + tensor->mutable_data({20, 30}, platform::CPUPlace()); // for unit test of ConcatOutputs - auto xg = scope->CreateVariable("rnn/x_grad")->GetMutable(); - xg->mutable_data(make_ddim({20, 30}), platform::CPUPlace()); + auto xg = scope.NewVar("rnn/x_grad")->GetMutable(); + xg->mutable_data({20, 30}, platform::CPUPlace()); - step_scopes->push_back(scope); + step_scopes->emplace_back(&scope); } // last time step - auto g = (*step_scopes)[9] - ->CreateVariable("rnn/h_pre_grad") - ->GetMutable(); - g->mutable_data(make_ddim({20, 30}), platform::CPUPlace()); + auto g = (*step_scopes)[9]->NewVar("rnn/h_pre_grad")->GetMutable(); + g->mutable_data({20, 30}, platform::CPUPlace()); } void CreateRNNGradientAlgorithm() { @@ -280,15 +275,13 @@ protected: void CreateStepNet() { LOG(INFO) << "create variable step_net"; - Variable* var = scope_->CreateVariable("step_net"); + Variable* var = scope_.NewVar("step_net"); auto net = var->GetMutable(); - net->AddOp(OpRegistry::CreateOp("mul", - {"rnn/h_pre", "rnn/w", "rnn/s_grad"}, - {"rnn/h_pre_grad", "rnn/w_grad"}, - {})); + net->AddOp(OpRegistry::CreateOp("mul", {"rnn/h_pre", "rnn/w", "rnn/s_grad"}, + {"rnn/h_pre_grad", "rnn/w_grad"}, {})); - net->AddOp(OpRegistry::CreateOp( - "add_two", {"rnn/h_grad"}, {"rnn/x_grad", "rnn/s_grad"}, {})); + net->AddOp(OpRegistry::CreateOp("add_two", {"rnn/h_grad"}, + {"rnn/x_grad", "rnn/s_grad"}, {})); net->CompleteAddOp(); } @@ -300,10 +293,10 @@ protected: rnn::Link inlink; inlink.external = "x"; inlink.internal = "rnn/x"; - std::vector>* step_scopes = - scope_->GetVariable("step_scopes") - ->GetMutable>>(); - rnn::SegmentInputs(*step_scopes, std::vector{inlink}, 10); + auto step_scopes = + scope_.FindVar("step_scopes")->GetMutable>(); + rnn::SegmentInputs(*step_scopes, std::vector{inlink}, 10, + true /*infer_shape_mode*/); } void LinkeMemories() { @@ -314,15 +307,15 @@ protected: mem_attr.boot_var = "boot_h"; std::vector memories; memories.push_back(mem_attr); - std::vector>* step_scopes = - scope_->GetVariable("step_scopes") - ->GetMutable>>(); + auto step_scopes = + scope_.FindVar("step_scopes")->GetMutable>(); for (int i = 1; i < 10; ++i) { - rnn::LinkMemories(*step_scopes, memories, i, -1); + rnn::LinkMemories(*step_scopes, memories, i, -1, + true /*infer_shape_mode*/); } } - std::shared_ptr scope_; + Scope scope_; RecurrentGradientAlgorithm rnn_grad_algo_; }; @@ -340,15 +333,15 @@ TEST(RecurrentOp, LinkMemories) { using namespace paddle::operators; // create and init step scopes - int len = 10; - std::vector> step_scopes; - for (int i = 0; i < len; ++i) { - auto scope = std::make_shared(); - scope->CreateVariable("pre_h"); - auto tensor = scope->CreateVariable("h")->GetMutable(); - float* data = tensor->mutable_data(make_ddim({15, 20}), CPUPlace()); - for (int i = 0; i < 15 * 20; ++i) { - data[i] = rand() * (1. / (double)RAND_MAX); + size_t len = 10; + std::vector step_scopes; + for (size_t i = 0; i < len; ++i) { + auto scope = new Scope(); + scope->NewVar("pre_h"); + auto tensor = scope->NewVar("h")->GetMutable(); + float* data = tensor->mutable_data({15, 20}, CPUPlace()); + for (size_t j = 0; j < 15 * 20; ++j) { + data[j] = rand() * (1. / (double)RAND_MAX); } step_scopes.push_back(scope); } @@ -361,40 +354,41 @@ TEST(RecurrentOp, LinkMemories) { std::vector memories; memories.push_back(mem_attr); - for (int i = 1; i < len; ++i) { - rnn::LinkMemories(step_scopes, memories, i, -1); + for (size_t i = 1; i < len; ++i) { + rnn::LinkMemories(step_scopes, memories, i, -1, false /*infer_shape_mode*/); } // check - for (int i = 0; i < len - 1; ++i) { + for (size_t i = 0; i < len - 1; ++i) { const float* a = - step_scopes[i]->GetVariable("h")->GetMutable()->data(); + step_scopes[i]->FindVar("h")->GetMutable()->data(); const float* b = step_scopes[i + 1] - ->GetVariable("pre_h") + ->FindVar("pre_h") ->GetMutable() ->data(); - for (size_t i = 0; i < 15 * 20; ++i) { - ASSERT_FLOAT_EQ(a[i], b[i]); + for (size_t j = 0; j < 15 * 20; ++j) { + ASSERT_FLOAT_EQ(a[j], b[j]); } } for (int i = len - 2; i >= 0; --i) { - rnn::LinkMemories(step_scopes, memories, i, 1); + rnn::LinkMemories(step_scopes, memories, i, 1, false /*infer_shape_mode*/); } // check for (int i = len - 2; i >= 0; --i) { - const float* a = step_scopes[i] - ->GetVariable("pre_h") - ->GetMutable() - ->data(); - const float* b = step_scopes[i + 1] - ->GetVariable("h") - ->GetMutable() - ->data(); - for (size_t i = 0; i < 15 * 20; ++i) { - ASSERT_FLOAT_EQ(a[i], b[i]); + const float* a = + step_scopes[i]->FindVar("pre_h")->GetMutable()->data(); + const float* b = + step_scopes[i + 1]->FindVar("h")->GetMutable()->data(); + for (size_t j = 0; j < 15 * 20; ++j) { + ASSERT_FLOAT_EQ(a[j], b[j]); } } + + for (auto s : step_scopes) { + delete s; + } } USE_OP(add_two); USE_OP(mul); +USE_OP_WITHOUT_KERNEL(recurrent_op); diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..43c97ba29f637828d717ac82516769deff52c7da --- /dev/null +++ b/paddle/operators/rnn/recurrent_op_utils.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/rnn/recurrent_op_utils.h" + +namespace paddle { +namespace operators { +namespace rnn { + +namespace fmw = paddle::framework; + +void SegmentInputs(const std::vector& step_scopes, + const std::vector& inlinks, const size_t seq_len, + bool infer_shape_mode) { + PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided."); + for (size_t i = 0; i < inlinks.size(); ++i) { + auto input_var = step_scopes[0]->FindVar(inlinks[i].external); + PADDLE_ENFORCE(input_var != nullptr, "input link [%s] is not in scope.", + inlinks[i].external); + + Tensor* input = input_var->GetMutable(); + fmw::DDim dims = input->dims(); + PADDLE_ENFORCE(static_cast(dims[0]) == seq_len, + "all the inlinks must have same length"); + fmw::DDim step_dims = slice_ddim(dims, 1, dims.size()); + for (size_t j = 0; j < seq_len; j++) { + Tensor* step_input = + step_scopes[j]->NewVar(inlinks[i].internal)->GetMutable(); + if (!infer_shape_mode) { + *step_input = input->Slice(j, j + 1); + } + step_input->Resize(step_dims); + } + } +} + +void ConcatOutputs(const std::vector& step_scopes, + const std::vector& outlinks, const size_t seq_len, + bool infer_shape_mode) { + for (size_t i = 0; i < outlinks.size(); i++) { + auto output_var = step_scopes[0]->FindVar(outlinks[i].external); + PADDLE_ENFORCE(output_var != nullptr, "output link [%s] is not in scope.", + outlinks[i].external); + Tensor* output = output_var->GetMutable(); + if (infer_shape_mode) { + fmw::DDim step_dims = step_scopes[0] + ->FindVar(outlinks[i].internal) + ->GetMutable() + ->dims(); + std::vector dims_vec = vectorize(step_dims); + dims_vec.insert(dims_vec.begin(), seq_len); + output->Resize(fmw::make_ddim(dims_vec)); + } else { + output->mutable_data(platform::CPUPlace()); + for (size_t j = 0; j < seq_len; j++) { + Tensor* step_output = + step_scopes[j]->FindVar(outlinks[i].internal)->GetMutable(); + // TODO(luotao02) data type and platform::DeviceContext() should set + // correctly + (output->Slice(j, j + 1)) + .CopyFrom(*step_output, platform::CPUPlace()); + } + } + } +} + +void LinkMemories(const std::vector& scopes, + const std::vector& memories, + const size_t step_id, const int offset, + bool infer_shape_mode) { + PADDLE_ENFORCE(step_id < scopes.size(), + "step [%d] is out of range of step scopes' size [%d]", step_id, + scopes.size()); + PADDLE_ENFORCE(static_cast(step_id) + offset >= 0, + "offset [%d] must be large than -[%d]", offset, step_id); + PADDLE_ENFORCE(step_id + offset < scopes.size(), + "offset [%d] is out of range, it must be less than (%d - %d)", + offset, scopes.size(), step_id); + auto scope = scopes[step_id]; + auto linked_scope = scopes[step_id + offset]; + for (auto& attr : memories) { + auto mem = scope->FindVar(attr.pre_var)->GetMutable(); + auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable(); + if (infer_shape_mode) { + mem->Resize(linked_mem->dims()); + } else { + mem->ShareDataWith(*linked_mem); + } + } +} + +void InitArgument(const ArgumentName& name, Argument* arg, + const OperatorBase& op) { + arg->step_net = op.Input(name.step_net); + arg->step_scopes = op.Output(name.step_scopes); + + auto inlinks = op.Inputs(name.inlinks); + auto inlink_alias = op.GetAttr>(name.inlink_alias); + PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(), + "the size of inlinks and inlink_alias don't match:%d,%d", + inlinks.size(), inlink_alias.size()); + for (size_t i = 0; i < inlinks.size(); ++i) { + rnn::Link link; + link.external = inlinks[i]; + link.internal = inlink_alias[i]; + (arg->inlinks).push_back(link); + } + + auto outlinks = op.Outputs(name.outlinks); + auto outlink_alias = op.GetAttr>(name.outlink_alias); + PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(), + "the size of outlinks and outlink_alias don't match:%d,%d", + outlinks.size(), outlink_alias.size()); + for (size_t i = 0; i < outlinks.size(); ++i) { + rnn::Link link; + link.external = outlinks[i]; + link.internal = outlink_alias[i]; + (arg->outlinks).push_back(link); + } + + auto boot_memories = op.Inputs(name.boot_memories); + + // attributes + auto memories = op.GetAttr>(name.memories); + auto pre_memories = op.GetAttr>(name.pre_memories); + + PADDLE_ENFORCE(memories.size() == boot_memories.size(), + "the size of memories, boot_memories don't match:%d,%d", + memories.size(), boot_memories.size()); + PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(), + "the size of pre_memories, boot_memories don't match:%d,%d", + pre_memories.size(), boot_memories.size()); + PADDLE_ENFORCE(memories.size() > 0, "more than 1 memories should be set"); + + for (size_t i = 0; i < memories.size(); ++i) { + rnn::MemoryAttr mem_attr; + mem_attr.var = memories[i]; + mem_attr.pre_var = pre_memories[i]; + mem_attr.boot_var = boot_memories[i]; + (arg->memories).push_back(mem_attr); + } +} + +} // namespace rnn +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..379754b98fcead6debe0a60efa62fce4b7761940 --- /dev/null +++ b/paddle/operators/rnn/recurrent_op_utils.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include + +#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" + +namespace paddle { +namespace operators { +namespace rnn { + +/** + * Memory of a RNN (same as the role of `Momory` in PaddlePaddle). + * + * Memory attributes cached by this op, dims will be infered from + * boot memories in father scope. Other attributes are copied from Op's proto + * attributes. + */ +struct MemoryAttr { + // name of current state variable + std::string var; + // name of previous step's state variable + std::string pre_var; + // name of the variables to init this memory (same role of `boot_layer` in + // PaddlePaddle), which is store in father's scope. + std::string boot_var; +}; + +struct Link { + // input or output links name. + std::string internal; + // alias to avoid duplicate keys in scopes. + std::string external; +}; + +struct Argument { + std::string step_net; + std::string step_scopes; + std::vector inlinks; + std::vector outlinks; + std::vector memories; +}; + +struct ArgumentName { + std::string step_net; + std::string step_scopes; + std::string inlinks; + std::string outlinks; + std::string inlink_alias; // the alias of inlinks in step net. + std::string outlink_alias; // the alias of outlinks in step net. + std::string memories; // the memory name + std::string pre_memories; // the previous memory name + std::string boot_memories; // the boot memory name +}; + +/** + * Prepare inputs for each step net. + */ +void SegmentInputs(const std::vector& step_scopes, + const std::vector& inlinks, const size_t seq_len, + bool infer_shape_mode); + +/** + * Process outputs of step nets and merge to variables. + */ +void ConcatOutputs(const std::vector& step_scopes, + const std::vector& outlinks, const size_t seq_len, + bool infer_shape_mode); + +void LinkMemories(const std::vector& step_scopes, + const std::vector& memories, const size_t step_id, + const int offset, bool infer_shape_mode); + +void InitArgument(const ArgumentName& name, Argument* arg, + const OperatorBase& op); + +} // namespace rnn +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc index 4129422fa744b2a7cf135b681efa73ffb2ebcdcc..8d1a36f2b332faad516ced012a409ca428bbf689 100644 --- a/paddle/operators/rowwise_add_op.cc +++ b/paddle/operators/rowwise_add_op.cc @@ -17,23 +17,23 @@ namespace paddle { namespace operators { class RowWiseAddOp : public OperatorWithKernel { -protected: - void InferShape(const std::vector &inputs, - const std::vector &outputs) const override { - PADDLE_ENFORCE(inputs.size() == 2UL, "Two inputs is needed by rowwise add"); - auto dim0 = inputs[0]->dims(); - auto dim1 = inputs[1]->dims(); + protected: + void InferShape(const InferShapeContext &ctx) const override { + PADDLE_ENFORCE(ctx.InputSize() == 2UL, + "Two inputs is needed by rowwise add"); + auto dim0 = ctx.Input(0)->dims(); + auto dim1 = ctx.Input(1)->dims(); PADDLE_ENFORCE(dim0.size() == 2, "Input 0 must be matrix"); PADDLE_ENFORCE(dim1.size() == 1, "The second input must be vector"); PADDLE_ENFORCE(dim0[1] == dim1[0], "The width of two input must be same"); - PADDLE_ENFORCE(outputs.size() == 1, "The output size must be 1"); - outputs[0]->Resize(inputs[0]->dims()); + PADDLE_ENFORCE(ctx.OutputSize() == 1, "The output size must be 1"); + ctx.Output(0)->Resize(ctx.Input(0)->dims()); } }; class RowWiseAddOpMaker : public OpProtoAndCheckerMaker { -public: + public: RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The left input of row-wise add op, must be matrix"); diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu index 4b33e38ebabe853e179fe70ef7fde0a80b9050e2..f76faa0a3a93a1ac277a1d1aa83c3fa6c3944648 100644 --- a/paddle/operators/rowwise_add_op.cu +++ b/paddle/operators/rowwise_add_op.cu @@ -1,3 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU #include "paddle/operators/rowwise_add_op.h" REGISTER_OP_GPU_KERNEL(rowwise_add, diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h index 4596925e9322f373c822608fd9aa6ecee6144d4c..b52524c47c7b80d8ddc6a94a4a6d03db8034088d 100644 --- a/paddle/operators/rowwise_add_op.h +++ b/paddle/operators/rowwise_add_op.h @@ -20,22 +20,20 @@ namespace operators { template class RowWiseAddKernel : public OpKernel { -public: - void Compute(const KernelContext& context) const override { - auto in0 = context.Input(0)->Get(); - auto in1 = context.Input(1)->Get(); - auto* out = context.Output(0)->GetMutable(); + public: + void Compute(const ExecutionContext& context) const override { + auto out = context.Output(0); out->mutable_data(context.GetPlace()); - auto input = EigenMatrix::From(in0); - auto bias = EigenVector::From(in1); + auto input = EigenMatrix::From(*context.Input(0)); + auto bias = EigenVector::From(*context.Input(1)); auto output = EigenMatrix::From(*out); const int bias_size = bias.dimension(0); const int rest_size = input.size() / bias_size; Eigen::DSizes one_d(input.size()); Eigen::DSizes bcast(rest_size); - output.reshape(one_d).device(*(context.GetEigenDevice())) = + output.reshape(one_d).device(context.GetEigenDevice()) = input.reshape(one_d) + bias.broadcast(bcast).reshape(one_d); } }; diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index f6c654a9e7083704e353c276e0abc975f4e61ef9..6307583f4ee3f185845690d0e378945d066eae75 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -18,22 +18,21 @@ namespace paddle { namespace operators { class SGDOp : public OperatorWithKernel { -protected: - void InferShape(const std::vector &inputs, - const std::vector &outputs) const override { - PADDLE_ENFORCE(inputs.size() == 2, "Input size of SGDOp must be two"); - PADDLE_ENFORCE(outputs.size() == 1, "Output size of SGDOp must be one"); - PADDLE_ENFORCE(inputs[0] != nullptr, "inputs[0] mast be set"); - PADDLE_ENFORCE(inputs[1] != nullptr, "inputs[1] mast be set"); - PADDLE_ENFORCE(outputs[0] != nullptr, "outputs[0] mast be set"); - PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(), + protected: + void InferShape(const InferShapeContext &ctx) const override { + PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of SGDOp must be two"); + PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of SGDOp must be one"); + PADDLE_ENFORCE(ctx.InputVar(0) != nullptr, "inputs[0] mast be set"); + PADDLE_ENFORCE(ctx.InputVar(1) != nullptr, "inputs[1] mast be set"); + PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr, "outputs[0] mast be set"); + PADDLE_ENFORCE(ctx.Input(0)->dims() == ctx.Input(1)->dims(), "Two input of SGD Op's dimension must be same."); - outputs[0]->Resize(inputs[0]->dims()); + ctx.Output(0)->Resize(ctx.Input(0)->dims()); } }; class SGDOpMaker : public OpProtoAndCheckerMaker { -public: + public: SGDOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("param", "input parameter"); diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index f8f5b90cab460b4457cfb0a88bfc012bafe0fbc2..72629ccfbb8bc8ec53045289bd985c721c62fa10 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -1,3 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU #include "paddle/operators/sgd_op.h" -REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h index 65179d323bd991b8b4e196c069a11cd901c62082..bf5b195933fce7faa46bcc96032e784076178cf7 100644 --- a/paddle/operators/sgd_op.h +++ b/paddle/operators/sgd_op.h @@ -20,17 +20,21 @@ namespace operators { template class SGDOpKernel : public OpKernel { -public: - void Compute(const KernelContext& ctx) const override { - auto param = ctx.Input("param")->Get(); - auto grad = ctx.Input("grad")->Get(); - auto* param_out = ctx.Output(0)->GetMutable(); + public: + void Compute(const ExecutionContext& ctx) const override { + auto param = ctx.Input("param"); + auto grad = ctx.Input("grad"); + auto param_out = ctx.Output(0); float lr = ctx.op_.GetAttr("learning_rate"); param_out->mutable_data(ctx.GetPlace()); - EigenVector::Flatten(*param_out).device(*(ctx.GetEigenDevice())) = - EigenVector::Flatten(param) - lr * EigenVector::Flatten(grad); + auto p = EigenVector::Flatten(*param); + auto g = EigenVector::Flatten(*grad); + auto o = EigenVector::Flatten(*param_out); + auto place = ctx.GetEigenDevice(); + + o.device(place) = p - lr * g; } }; diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index 716f1d9c4dbc45e2d5569f8d634b06fd988a149c..9d201eb93a2c0e34dd8e6869e97b43c4e278596e 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -17,17 +17,16 @@ namespace paddle { namespace operators { class SigmoidOp : public OperatorWithKernel { -protected: - void InferShape(const std::vector &inputs, - const std::vector &outputs) const override { - PADDLE_ENFORCE(inputs.size() == 1, "Sigmoid Op only have one input"); - PADDLE_ENFORCE(outputs.size() == 1, "Sigmoid Op only have one output"); - outputs[0]->Resize(inputs[0]->dims()); + protected: + void InferShape(const InferShapeContext &ctx) const override { + PADDLE_ENFORCE(ctx.InputSize() == 1, "Sigmoid Op only have one input"); + PADDLE_ENFORCE(ctx.OutputSize() == 1, "Sigmoid Op only have one output"); + ctx.Output(0)->Resize(ctx.Input(0)->dims()); } }; class SigmoidOpMaker : public OpProtoAndCheckerMaker { -public: + public: SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "sigmoid input"); @@ -37,9 +36,8 @@ public: }; class SigmoidOpGrad : public OperatorWithKernel { -protected: - void InferShape(const std::vector &inputs, - const std::vector &outputs) const override {} + protected: + void InferShape(const InferShapeContext &ctx) const override {} std::string DebugString() const override { LOG(INFO) << "SigmoidGrad"; return ""; diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu index f679b20418f04eff4310efe4e121963ce5a235e0..2123b17e4b5e90c22c2d6e9177f2a8956f8a4ac9 100644 --- a/paddle/operators/sigmoid_op.cu +++ b/paddle/operators/sigmoid_op.cu @@ -1,3 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU #include "paddle/operators/sigmoid_op.h" REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel); diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h index 896a6f5d83e0f96de50e3aaae6f545172bf5da14..eb473920a5f866825b52ecb946653ccead7000ea 100644 --- a/paddle/operators/sigmoid_op.h +++ b/paddle/operators/sigmoid_op.h @@ -21,16 +21,17 @@ namespace operators { template class SigmoidKernel : public OpKernel { -public: - void Compute(const KernelContext& context) const override { - auto input = context.Input(0)->Get(); - auto* output = context.Output(0)->GetMutable(); - + public: + void Compute(const ExecutionContext& context) const override { + auto input = context.Input(0); + auto output = context.Output(0); output->mutable_data(context.GetPlace()); - EigenVector::Flatten(*output).device( - *(context.GetEigenDevice())) = - 1.0 / (1.0 + (-1.0 * EigenVector::Flatten(input)).exp()); + auto X = EigenVector::Flatten(*input); + auto Y = EigenVector::Flatten(*output); + auto place = context.GetEigenDevice(); + + Y.device(place) = 1.0 / (1.0 + (-1.0 * X).exp()); } }; } // namespace operators diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index df60b62fa6ac8d67c9dadc40ec49aaedab92bc88..a070458f5e55cf47253ab0df5af7a1163b4f8092 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -1,36 +1,37 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ #include "paddle/operators/softmax_op.h" namespace paddle { namespace operators { class SoftmaxOp : public OperatorWithKernel { -protected: - void InferShape(const std::vector &inputs, - const std::vector &outputs) const override { - PADDLE_ENFORCE(inputs.size() == 1, "Only one input is need for softmax"); - PADDLE_ENFORCE(inputs[0]->dims().size() == 2, + protected: + void InferShape(const InferShapeContext &ctx) const override { + PADDLE_ENFORCE(ctx.InputSize() == 1UL, + "Only one input is need for softmax"); + PADDLE_ENFORCE(ctx.Input("X")->dims().size() == 2UL, "The input of softmax op must be matrix"); - PADDLE_ENFORCE(outputs.size() == 1, "Only one output is need for softmax"); - - outputs[0]->Resize(inputs[0]->dims()); + PADDLE_ENFORCE(ctx.OutputSize() == 1UL, + "Only one output is need for softmax"); + ctx.Output("Y")->Resize(ctx.Input("X")->dims()); } }; class SoftmaxOpMaker : public OpProtoAndCheckerMaker { -public: + public: SoftmaxOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "input of softmax"); @@ -40,12 +41,20 @@ public: }; class SoftmaxOpGrad : public OperatorWithKernel { -protected: - void InferShape(const std::vector &inputs, - const std::vector &outputs) const override {} - std::string DebugString() const override { - LOG(INFO) << "SoftmaxOpGrad"; - return ""; + protected: + void InferShape(const InferShapeContext &ctx) const override { + PADDLE_ENFORCE(ctx.InputSize() == 3UL, + "Input of SoftmaxOpGrad should be 3, X, Y, YG"); + PADDLE_ENFORCE(ctx.OutputSize() == 1UL, + "Output of SoftmaxOpGrad should be 1"); + PADDLE_ENFORCE(ctx.InputVar("Y") != nullptr, "Input(Y) should not be null"); + PADDLE_ENFORCE(ctx.InputVar(framework::GradVarName("Y")) != nullptr, + "Input(Y@GRAD) should not be null"); + PADDLE_ENFORCE(ctx.Input("Y")->dims() == + ctx.Input(framework::GradVarName("Y"))->dims(), + "the shape of Input(0) and Input(1) should be the same"); + ctx.Output(framework::GradVarName("X")) + ->Resize(ctx.Input("Y")->dims()); } }; @@ -53,5 +62,7 @@ protected: } // namespace paddle REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker); -REGISTER_GRADIENT_OP(softmax, softmax_grad, ops::SoftmaxOpGrad); REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel); +REGISTER_GRADIENT_OP(softmax, softmax_grad, ops::SoftmaxOpGrad); +REGISTER_OP_CPU_KERNEL(softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu index a1f6944a369fe5148ffcfeabf3bf7063dcbc2664..b79228580a7ea0f70b62eb2dc7a61cf85bc0b5fb 100644 --- a/paddle/operators/softmax_op.cu +++ b/paddle/operators/softmax_op.cu @@ -1,4 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU #include "paddle/framework/op_registry.h" #include "paddle/operators/softmax_op.h" REGISTER_OP_GPU_KERNEL(softmax, ops::SoftmaxKernel); +REGISTER_OP_GPU_KERNEL(softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index 625a87b58560231572c1cca2a21bd0c47c8cb296..b2dbcf57edf1a64da8da0d9a4c14d708eec17f3f 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -1,19 +1,22 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once +#include "paddle/framework/ddim.h" +#include "paddle/framework/operator.h" +#include "paddle/framework/tensor.h" #include "paddle/operators/type_alias.h" namespace paddle { @@ -21,13 +24,13 @@ namespace operators { template class SoftmaxKernel : public OpKernel { -public: - void Compute(const KernelContext& context) const override { - auto input = context.Input(0)->Get(); - auto* output = context.Output(0)->GetMutable(); + public: + void Compute(const ExecutionContext& context) const override { + auto input = context.Input("X"); + auto output = context.Output("Y"); output->mutable_data(context.GetPlace()); - auto logits = EigenMatrix::From(input); + auto logits = EigenMatrix::From(*input); auto softmax = EigenMatrix::From(*output); const int kBatchDim = 0; @@ -46,9 +49,9 @@ public: .reshape(batch_by_one) .broadcast(one_by_class)); - softmax.device(*(context.GetEigenDevice())) = shifted_logits.exp(); + softmax.device(context.GetEigenDevice()) = shifted_logits.exp(); - softmax.device(*(context.GetEigenDevice())) = + softmax.device(context.GetEigenDevice()) = (softmax * softmax.sum(along_class) .inverse() @@ -57,5 +60,38 @@ public: .broadcast(one_by_class)); } }; + +template +class SoftmaxGradKernel : public OpKernel { + public: + void Compute(const ExecutionContext& context) const override { + std::shared_ptr scale_ = std::make_shared(); + + auto Y = context.Input("Y"); + auto dY = context.Input(framework::GradVarName("Y")); + auto dX = context.Output(framework::GradVarName("X")); + dX->mutable_data(context.GetPlace()); + + const int batch_size = Y->dims()[0]; + const int class_num = Y->dims()[1]; + + Eigen::DSizes along_class(1); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, class_num); + + auto Y_eigen = EigenMatrix::From(*Y); + auto dY_eigen = EigenMatrix::From(*dY); + auto dX_eigen = EigenMatrix::From(*dX); + auto place = context.GetEigenDevice(); + + auto dot = (Y_eigen * dY_eigen) + .sum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class); + dX_eigen.device(place) = (dY_eigen - dot) * Y_eigen; + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/operators/type_alias.h b/paddle/operators/type_alias.h index b712e457ff60e8b30b87c0d549693d53e9f05d59..eac12d35dd8d2977191218167ebb0a6e638d5d73 100644 --- a/paddle/operators/type_alias.h +++ b/paddle/operators/type_alias.h @@ -15,36 +15,40 @@ #pragma once #include "paddle/framework/eigen.h" -#include "paddle/framework/net.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/net_op.h" namespace paddle { namespace operators { using OpKernel = framework::OpKernel; -using KernelContext = framework::KernelContext; -template +using EigenScalar = framework::EigenScalar; +template using EigenVector = framework::EigenVector; -template using EigenMatrix = framework::EigenMatrix; -template using EigenTensor = framework::EigenTensor; using Tensor = framework::Tensor; +using Scope = framework::Scope; using OperatorWithKernel = framework::OperatorWithKernel; +using OperatorBase = framework::OperatorBase; using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker; using OpProto = framework::OpProto; using OpAttrChecker = framework::OpAttrChecker; using CPUPlace = platform::CPUPlace; using GPUPlace = platform::GPUPlace; -using NetOp = framework::NetOp; using OpRegistry = framework::OpRegistry; + } // namespace operators } // namespace paddle diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index f5182707611318669b98e6f7b49344604bf7ab4a..450213c34afc8757b9dbc9a512e45a68745ae988 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -42,7 +42,7 @@ class CPUDeviceContext : public DeviceContext { public: typedef std::mt19937 random_generator_type; CPUDeviceContext(); - CPUDeviceContext(CPUPlace); + explicit CPUDeviceContext(CPUPlace); virtual ~CPUDeviceContext() {} Eigen::DefaultDevice* eigen_device() const; @@ -80,10 +80,10 @@ class CUDADeviceContext : public DeviceContext { // clang-format off /*! \brief Return cublas handle in the device context. */ - cublasHandle_t cublas_handle (); + cublasHandle_t cublas_handle(); /*! \brief Return cudnn handle in the device context. */ - cudnnHandle_t cudnn_handle (); + cudnnHandle_t cudnn_handle(); /*! \brief Return curand handle in the device context. */ curandGenerator_t curand_generator(); diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index af2ce17fc2238dda62e9888ebe9426edcd55d2bc..65345c433c0a328e7f89038a39312edba35eb8c7 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -15,24 +15,28 @@ limitations under the License. */ #include "paddle/platform/device_context.h" #include "gtest/gtest.h" -using DEVICE_GPU = Eigen::GpuDevice; TEST(Device, Init) { + using paddle::platform::DeviceContext; + using paddle::platform::CUDADeviceContext; + using paddle::platform::GPUPlace; + int count = paddle::platform::GetDeviceCount(); for (int i = 0; i < count; i++) { - paddle::platform::DeviceContext* device_context = - new paddle::platform::CUDADeviceContext(i); + DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); Eigen::GpuDevice* gpu_device = - device_context->template get_eigen_device(); + device_context->template get_eigen_device(); ASSERT_NE(nullptr, gpu_device); delete device_context; } } TEST(Device, CUDADeviceContext) { + using paddle::platform::CUDADeviceContext; + using paddle::platform::GPUPlace; + int count = paddle::platform::GetDeviceCount(); for (int i = 0; i < count; i++) { - paddle::platform::CUDADeviceContext* device_context = - new paddle::platform::CUDADeviceContext(i); + CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); diff --git a/paddle/platform/dynload/cublas.cc b/paddle/platform/dynload/cublas.cc index 4e3dfdaefb2348346e8f917b1f6c758bf6d91a1a..9cd2a1f565526f8dc45932ba6168f4e25c6ad238 100644 --- a/paddle/platform/dynload/cublas.cc +++ b/paddle/platform/dynload/cublas.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include namespace paddle { diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc index 8b5e15b5efcdae6a1eed09f002eb2f4f2163035f..d3e4cb567d71b987724366b6a0896f5df0eb6055 100644 --- a/paddle/platform/dynload/cudnn.cc +++ b/paddle/platform/dynload/cudnn.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include namespace paddle { @@ -25,4 +39,4 @@ CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); } // namespace dynload } // namespace platform -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/platform/dynload/curand.cc b/paddle/platform/dynload/curand.cc index 5c1fab992c98569d4a95b6e699d97d428511e48e..d05dd88126bfee7278e553710a717b8f2eb02ae0 100644 --- a/paddle/platform/dynload/curand.cc +++ b/paddle/platform/dynload/curand.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include namespace paddle { @@ -10,6 +24,7 @@ void *curand_dso_handle; #define DEFINE_WRAP(__name) DynLoad__##__name __name CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); -} -} -} \ No newline at end of file + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index fd4adbd9deca12ad6c3a59cfd5d30fb0cb6fcf98..bc0715656a7d61774d53d4a0643ec1c105706085 100644 --- a/paddle/platform/enforce.h +++ b/paddle/platform/enforce.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#include #include +#include #include #include #include @@ -39,12 +41,22 @@ namespace platform { struct EnforceNotMet : public std::exception { std::exception_ptr exp_; std::string err_str_; - EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) { + static constexpr int TRACE_STACK_LIMIT = 100; try { std::rethrow_exception(exp_); } catch (const std::exception& exp) { - err_str_ = string::Sprintf("%s at [%s:%d]", exp.what(), f, l); + std::ostringstream sout; + sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl; + sout << "Call Stacks: " << std::endl; + void* call_stack[TRACE_STACK_LIMIT]; + int sz = backtrace(call_stack, TRACE_STACK_LIMIT); + auto line = backtrace_symbols(call_stack, sz); + for (int i = 0; i < sz; ++i) { + sout << line[i] << std::endl; + } + free(line); + err_str_ = sout.str(); } } @@ -132,12 +144,12 @@ inline void throw_on_error(T e) { throw_on_error(e, ""); } -#define PADDLE_THROW(...) \ - do { \ - throw ::paddle::platform::EnforceNotMet( \ - std::make_exception_ptr( \ - std::runtime_error(string::Sprintf(__VA_ARGS__))), \ - __FILE__, __LINE__); \ +#define PADDLE_THROW(...) \ + do { \ + throw ::paddle::platform::EnforceNotMet( \ + std::make_exception_ptr( \ + std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \ + __FILE__, __LINE__); \ } while (0) #define PADDLE_ENFORCE(...) \ @@ -150,5 +162,50 @@ inline void throw_on_error(T e) { } \ } while (0) +/* + * Some enforce helpers here, usage: + * int a = 1; + * int b = 2; + * PADDLE_ENFORCE_EQ(a, b); + * + * will raise an expression described as follows: + * "enforce a == b failed, 1 != 2" with detailed stack infomation. + * + * extra messages is also supported, for example: + * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) + */ + +#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) +#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__) +#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__) +#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__) +#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) +#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) + +// if two values have different data types, choose a compatible type for them. +template +struct CompatibleType { + static const bool t1_to_t2 = std::is_convertible::value; + typedef typename std::conditional::type type; +}; + +#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ + PADDLE_ENFORCE(__COMPATIBLE_TYPE(__VAL0, __VAL1, __VAL0) \ + __CMP __COMPATIBLE_TYPE(__VAL0, __VAL1, __VAL1), \ + "enforce %s " #__CMP " %s failed, %s " #__INV_CMP " %s\n%s", \ + #__VAL0, #__VAL1, std::to_string(__VAL0), \ + std::to_string(__VAL1), \ + paddle::string::Sprintf("" __VA_ARGS__)); + +#define __COMPATIBLE_TYPE(__VAL0, __VAL1, __VAL) \ + typename paddle::platform::CompatibleType::type(__VAL) + } // namespace platform } // namespace paddle diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc index 2ac31812a80d8dd57ce82234cb5835e029a46067..7117b49474044af08ae9db79c2fae6693e966af2 100644 --- a/paddle/platform/enforce_test.cc +++ b/paddle/platform/enforce_test.cc @@ -34,3 +34,165 @@ TEST(ENFORCE, FAILED) { } ASSERT_TRUE(in_catch); } + +TEST(ENFORCE, NO_ARG_OK) { + int a = 2; + int b = 2; + PADDLE_ENFORCE_EQ(a, b); + // test enforce with extra message. + PADDLE_ENFORCE_EQ(a, b, "some thing wrong %s", "info"); +} + +TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) { + int a = 2; + bool in_catch = false; + + try { + PADDLE_ENFORCE_EQ(a, 1 + 3); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = "enforce a == 1 + 3 failed, 2 != 4"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} + +TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) { + int a = 2; + bool in_catch = false; + + try { + PADDLE_ENFORCE_EQ(a, 1 + 3, "%s size not match", "their"); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = + "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} + +TEST(ENFORCE_NE, OK) { + PADDLE_ENFORCE_NE(1, 2); + PADDLE_ENFORCE_NE(1.0, 2UL); +} +TEST(ENFORCE_NE, FAIL) { + bool in_catch = false; + + try { + // 2UL here to check data type compatible + PADDLE_ENFORCE_NE(1.0, 1UL); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = "enforce 1.0 != 1UL failed, 1.000000 == 1"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} + +TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); } +TEST(ENFORCE_GT, FAIL) { + bool in_catch = false; + + try { + // 2UL here to check data type compatible + PADDLE_ENFORCE_GT(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = "enforce 1 > 2UL failed, 1 <= 2"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} + +TEST(ENFORCE_GE, OK) { + PADDLE_ENFORCE_GE(2, 2UL); + PADDLE_ENFORCE_GE(3, 2UL); + PADDLE_ENFORCE_GE(3, 2); + PADDLE_ENFORCE_GE(3.21, 2UL); +} +TEST(ENFORCE_GE, FAIL) { + bool in_catch = false; + + try { + PADDLE_ENFORCE_GE(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = "enforce 1 >= 2UL failed, 1 < 2"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} + +TEST(ENFORCE_LE, OK) { + PADDLE_ENFORCE_LE(1, 1); + PADDLE_ENFORCE_LE(1, 1UL); + PADDLE_ENFORCE_LE(2, 3UL); + PADDLE_ENFORCE_LE(2UL, 3); + PADDLE_ENFORCE_LE(2UL, 3.2); +} +TEST(ENFORCE_LE, FAIL) { + bool in_catch = false; + + try { + PADDLE_ENFORCE_GT(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = "enforce 1 > 2UL failed, 1 <= 2"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} + +TEST(ENFORCE_LT, OK) { + PADDLE_ENFORCE_LT(3, 10); + PADDLE_ENFORCE_LT(2, 3UL); + PADDLE_ENFORCE_LT(2UL, 3); +} +TEST(ENFORCE_LT, FAIL) { + bool in_catch = false; + + try { + PADDLE_ENFORCE_LT(1UL, 0.12); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = "enforce 1UL < 0.12 failed, 1 >= 0.12"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} diff --git a/paddle/platform/place.h b/paddle/platform/place.h index 7cead183884bc9379355cd931921b40d6c11ce90..a82e8c942fa28297d91056a66b61f085f2bdb946 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -32,7 +32,7 @@ struct CPUPlace { struct GPUPlace { GPUPlace() : GPUPlace(0) {} - GPUPlace(int d) : device(d) {} + explicit GPUPlace(int d) : device(d) {} // needed for variant equality comparison inline bool operator==(const GPUPlace &o) const { return device == o.device; } diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index a8994366bc34ee3ec2c39e3482fc3757b089e61f..925f2eb670ca7553257467096ec0cd4b88b46fa4 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1,3 +1,10 @@ -cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python - add_op fc_op sgd_op cross_entropy_op random_op) - add_op fc_op sgd_op cross_entropy_op recurrent_network_op guassian_random_op) +cc_library(paddle_pybind SHARED + SRCS pybind.cc + DEPS pybind python backward + fc_op + sgd_op + add_op + mean_op + guassian_random_op + cross_entropy_op + recurrent_op) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 3860facb099950a5287d3f6b89c3de38f588f568..ede9e210245df740f13ebb32c98313554f522dd9 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -39,6 +39,10 @@ Configuring cmake in /paddle/build ... -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ======================================== EOF + +# Disable UNITTEST_USE_VIRTUALENV in docker because +# docker environment is fully controlled by this script. +# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option. cmake .. \ -DCMAKE_BUILD_TYPE=Release \ -DWITH_DOC=OFF \ @@ -52,39 +56,43 @@ cmake .. \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON cat <> /paddle/build/Dockerfile < /dev/null -SCRIPTPATH=$PWD -popd > /dev/null - -USE_VIRTUALENV_FOR_TEST=$1; shift -PYTHON=$1; shift - -if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then - rm -rf .test_env - virtualenv .test_env - unset PYTHONHOME - unset PYTHONPATH - source .test_env/bin/activate - PYTHON=python -fi - -$PYTHON -m pip install $SCRIPTPATH/../dist/*.whl - -if [ "X${PADDLE_PACKAGE_DIR}" != "X" ]; then - $PYTHON -m pip install ${PADDLE_PACKAGE_DIR}/*.whl -else - export PYTHONPATH=$SCRIPTPATH/../../python/ -fi - -$PYTHON -m pip install ipython==5.3 - -for fn in "$@" -do - echo "test $fn" - $PYTHON $fn - if [ $? -ne 0 ]; then - exit 1 - fi -done - -if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then - deactivate - rm -rf .test_env -fi diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index a44385158042a23eca175df261852148642f7fa0..33fb5d84e2701c163b5d1b1bb3362ee81ebb34ea 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -6,14 +6,14 @@ mkdir -p $TRAVIS_BUILD_DIR/build cd $TRAVIS_BUILD_DIR/build # Compile paddle binaries first -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_GOLANG=ON -DWITH_STYLE_CHECK=OFF +cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_GOLANG=ON -DWITH_STYLE_CHECK=OFF mkdir output make -j `nproc` find .. -name '*whl' | xargs pip install # install all wheels. rm -rf * # Compile Documentation only. -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON +cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON make -j `nproc` paddle_docs paddle_docs_cn # check websites for broken links diff --git a/paddle/setup.py.in b/paddle/setup.py.in index 06d55d3abc6097fa7d4b2b2ac9e29681e0fddfd5..af107e76723135124e56db52a76e4f8aff5c4acf 100644 --- a/paddle/setup.py.in +++ b/paddle/setup.py.in @@ -22,7 +22,9 @@ setup(name="py_paddle", package_data={'py_paddle':['*.py','_swig_paddle.so']}, install_requires = [ 'nltk>=3.2.2', - 'numpy>=1.8.0', # The numpy is required. + # We use `numpy.flip` in `test_image.py`. + # `numpy.flip` is introduced in `1.12.0` + 'numpy>=1.12.0', # The numpy is required. 'protobuf==${PROTOBUF_VERSION}' # The paddle protobuf version ], url='http://www.paddlepaddle.org/', diff --git a/paddle/string/piece.h b/paddle/string/piece.h index 0272529d1c9b2cb6000a26f1d4d80276d06bf27b..03ae9243a4cc4e9e92e376bf46ab2b1d7162dfcb 100644 --- a/paddle/string/piece.h +++ b/paddle/string/piece.h @@ -39,8 +39,8 @@ public: // size_ is 0. Piece(); Piece(const char* d, size_t n); - Piece(const char* d); - Piece(const std::string& s); + Piece(const char* d); // NOLINT: accept C string into Piece. + Piece(const std::string& s); // NOLINT: accept C++ string into Piece. const char* data() const { return data_; } size_t len() const { return size_; } diff --git a/paddle/trainer/tests/compare_sparse_data b/paddle/trainer/tests/compare_sparse_data new file mode 100644 index 0000000000000000000000000000000000000000..18fc6541383d8e8e1687b8fe1abd57aece3d4cfc Binary files /dev/null and b/paddle/trainer/tests/compare_sparse_data differ diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data similarity index 100% rename from paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto rename to paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist index 8b041cd66416862a78dba27368a65860a68ef1a5..6b406dff0ba91b5f310d7eafa111c0d21d6542c3 100644 --- a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist +++ b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist @@ -1 +1 @@ -./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto +./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data diff --git a/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf b/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf new file mode 100644 index 0000000000000000000000000000000000000000..92f32a18c0068ab4672034a270aa8c52f2716d59 --- /dev/null +++ b/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf @@ -0,0 +1,154 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. + +# Note: when making change to this file, please make sure +# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest +# for comparing these two nets can pass (test_CompareTwoNets) + +default_initial_std(0.1) +default_device(0) + +word_dim = 999 +l1 = 0 +l2 = 0 + +model_type("nn") + +sparse_update = get_config_arg("sparse_update", bool, False) + +TrainData(ProtoData( + type = "proto_sequence", + files = ('trainer/tests/train_sparse.list'), + )) + +Settings( + algorithm='sgd', + batch_size=100, + learning_rate=0.0001, + learning_rate_decay_a=4e-08, + learning_rate_decay_b=0.0, + learning_rate_schedule='poly', +) + + +wordvec_dim = 32 +layer2_dim = 16 +layer3_dim = 16 +hidden_dim = 32 + +slot_names = ["qb", "qw", "tb", "tw"] + +def ltr_network(network_name, + word_dim=word_dim, + wordvec_dim=wordvec_dim, + layer2_dim=layer2_dim, + layer3_dim=layer3_dim, + hidden_dim=hidden_dim, + slot_names=slot_names, + l1=l1, + l2=l2): + + slotnum = len(slot_names) + for i in xrange(slotnum): + Inputs(slot_names[i] + network_name) + for i in xrange(slotnum): + Layer( + name = slot_names[i] + network_name, + type = "data", + size = word_dim, + device = -1, + ) + Layer( + name = slot_names[i] + "_embedding_" + network_name, + type = "mixed", + size = wordvec_dim, + bias = False, + device = -1, + inputs = TableProjection(slot_names[i] + network_name, + parameter_name = "embedding.w0", + decay_rate_l1=l1, + sparse_remote_update = True, + sparse_update = sparse_update, + ), + ) + Layer( + name = slot_names[i] + "_rnn1_" + network_name, + type = "recurrent", + active_type = "tanh", + bias = Bias(initial_std = 0, + parameter_name = "rnn1.bias"), + inputs = Input(slot_names[i] + "_embedding_" + network_name, + parameter_name = "rnn1.w0") + ) + Layer( + name = slot_names[i] + "_rnnlast_" + network_name, + type = "seqlastins", + inputs = [ + slot_names[i] + "_rnn1_" + network_name, + ], + ) + + Layer( + name = "layer2_" + network_name, + type = "fc", + active_type = "tanh", + size = layer2_dim, + bias = Bias(parameter_name = "layer2.bias"), + inputs = [Input(slot_name + "_rnnlast_" + network_name, + parameter_name = "_layer2_" + slot_name + ".w", + decay_rate = l2, + initial_smart = True) for slot_name in slot_names] + ) + Layer( + name = "layer3_" + network_name, + type = "fc", + active_type = "tanh", + size = layer3_dim, + bias = Bias(parameter_name = "layer3.bias"), + inputs = [ + Input("layer2_" + network_name, + parameter_name = "_layer3.w", + decay_rate = l2, + initial_smart = True), + ] + ) + Layer( + name = "output_" + network_name, + type = "fc", + size = 1, + bias = False, + inputs = [ + Input("layer3_" + network_name, + parameter_name = "_layerO.w"), + ], + ) + + +ltr_network("left") +ltr_network("right") +Inputs("label") +Layer( + name = "label", + type = "data", + size = 1, + ) +Outputs("cost", "qb_rnnlast_left") +Layer( + name = "cost", + type = "rank-cost", + inputs = ["output_left", "output_right", "label"], + ) diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp index a7000eb77e1bbeab4f6e38c0322f82bde7164080..813275518e411d6e963e23df634541f771096e0f 100644 --- a/paddle/trainer/tests/test_CompareSparse.cpp +++ b/paddle/trainer/tests/test_CompareSparse.cpp @@ -23,7 +23,7 @@ using namespace paddle; // NOLINT using namespace std; // NOLINT static const string& configFile1 = - "trainer/tests/sample_trainer_config_qb_rnn.conf"; + "trainer/tests/sample_trainer_config_compare_sparse.conf"; DECLARE_bool(use_gpu); DECLARE_string(config); diff --git a/paddle/trainer/tests/train_sparse.list b/paddle/trainer/tests/train_sparse.list new file mode 100644 index 0000000000000000000000000000000000000000..6ea020e2202f8464f8a647cd96c84a9d17a03ae3 --- /dev/null +++ b/paddle/trainer/tests/train_sparse.list @@ -0,0 +1 @@ +trainer/tests/compare_sparse_data diff --git a/proto/DataConfig.proto b/proto/DataConfig.proto index e895c184d9f95dba1449e6467a2566712837600b..0cb5d7afbb3e1cb4abe45c0ed677e09b27b870fa 100644 --- a/proto/DataConfig.proto +++ b/proto/DataConfig.proto @@ -15,14 +15,13 @@ syntax = "proto2"; package paddle; - message FileGroupConf { - optional uint32 queue_capacity = 1 [default = 1]; + optional uint32 queue_capacity = 1 [ default = 1 ]; // how many files to load for a load file thread - optional int32 load_file_count = 2 [default = 1]; + optional int32 load_file_count = 2 [ default = 1 ]; // how many threads to load files // Setting to be 5~10 is appropriate when loading files by hadoop vfs - optional int32 load_thread_num = 3 [default = 1]; + optional int32 load_thread_num = 3 [ default = 1 ]; }; message DataConfig { @@ -32,26 +31,28 @@ message DataConfig { // name of a text file which contains a list of file names at each line optional string files = 3; - optional int32 feat_dim = 4;//feature dimension of one frame - repeated int32 slot_dims = 5;//feature slot dims - optional int32 context_len = 6;//max neibour frame numbers - optional uint64 buffer_capacity = 7;//the number of samples + optional int32 feat_dim = 4; // feature dimension of one frame + repeated int32 slot_dims = 5; // feature slot dims + optional int32 context_len = 6; // max neibour frame numbers + optional uint64 buffer_capacity = 7; // the number of samples - //part of data used in training - //if not -1, part of train data is used in training - optional int64 train_sample_num = 8 [default = -1]; + // part of data used in training + // if not -1, part of train data is used in training + optional int64 train_sample_num = 8 [ default = -1 ]; - //The number of documents processed once - optional int32 file_load_num = 9 [default = -1]; - optional bool async_load_data = 12 [default = false]; + // The number of documents processed once + optional int32 file_load_num = 9 [ default = -1 ]; + optional bool async_load_data = 12 [ default = false ]; /// Note the field number 10, 11 and 13 have been deprecated. - optional bool for_test = 14 [default = false]; // whether this data is for test + optional bool for_test = 14 + [ default = false ]; // whether this data is for test optional FileGroupConf file_group_conf = 15; repeated int32 float_slot_dims = 16; /// Note the field number 17, 18 and 19 have been deprecated. - // a list of values which will be used to create additional one dimensional float + // a list of values which will be used to create additional one dimensional + // float // values slots. These one dimensional slots can be used as the weight input // for cost layers. // Currently this is only supported by ProtoDataProvider. @@ -65,21 +66,21 @@ message DataConfig { // for MultiDataProvider repeated DataConfig sub_data_configs = 24; // sub dataproviders - /* - * the ratio of each sub dataproviders: - * e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100, - * then each mini-batch is combined by 10 instance from A and 90 instances - * from B. - */ + /* + * the ratio of each sub dataproviders: + * e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100, + * then each mini-batch is combined by 10 instance from A and 90 instances + * from B. + */ optional int32 data_ratio = 25; /* * if one of the sub dataproviders is running out of data, then * (1) it is "main data", then finish current pass. * (2) it is not "main data", then reset it, and try getNextBatch again. */ - optional bool is_main_data = 26 [default = true]; + optional bool is_main_data = 26 [ default = true ]; - // the usage ratio of instances. Setting to 1.0 means the use of all instances. - optional double usage_ratio = 27 [default = 1.0]; + // the usage ratio of instances. Setting to 1.0 means the use of all + // instances. + optional double usage_ratio = 27 [ default = 1.0 ]; }; - diff --git a/proto/DataFormat.proto b/proto/DataFormat.proto index 19b1499b0281a1b92028cc8944c27ee4d56b8dd2..7d963bc29f7c6b9895323b0d57ba4ee4cb4387d0 100644 --- a/proto/DataFormat.proto +++ b/proto/DataFormat.proto @@ -17,27 +17,32 @@ package paddle; /* If values is not empty and ids is empty, this is a dense vector. - If values is not empty and ids is not empty, this is a sparse vector. The position of each value + If values is not empty and ids is not empty, this is a sparse vector. The + position of each value is specified by ids. - If values is empty and ids is not empty, this is a sparse vector whose non-zero values are 1. + If values is empty and ids is not empty, this is a sparse vector whose non-zero + values are 1. The position of each 1 is specified by ids. */ message VectorSlot { - repeated float values = 1 [packed = true]; - repeated uint32 ids = 2 [packed = true]; + repeated float values = 1 [ packed = true ]; + repeated uint32 ids = 2 [ packed = true ]; /* For multidimensional data, for example "image width height depth" */ - repeated uint32 dims = 3 [packed = true]; - repeated string strs = 4; + repeated uint32 dims = 3 [ packed = true ]; + repeated string strs = 4; }; /* - SubseqSlot use to record whether VectorSlot or any other slot in future has subseq. - If not all VectorSlot have subseq, we only store the one who has subseq, and use *slot_id* to record it. - One vector_slots has one sequence, and it may have N subseq, thus the number of *lens* will be N too. + SubseqSlot use to record whether VectorSlot or any other slot in future has + subseq. + If not all VectorSlot have subseq, we only store the one who has subseq, and + use *slot_id* to record it. + One vector_slots has one sequence, and it may have N subseq, thus the number of + *lens* will be N too. */ message SubseqSlot { - required uint32 slot_id = 1; //the id of slot who has subseq - repeated uint32 lens = 2; // lengths of sub-sequence in the slot + required uint32 slot_id = 1; // the id of slot who has subseq + repeated uint32 lens = 2; // lengths of sub-sequence in the slot }; message SlotDef { @@ -45,13 +50,14 @@ message SlotDef { VECTOR_DENSE = 0; VECTOR_SPARSE_NON_VALUE = 1; VECTOR_SPARSE_VALUE = 2; - INDEX = 3; // This can be used as label, or word id, etc. + INDEX = 3; // This can be used as label, or word id, etc. VAR_MDIM_DENSE = 4; VAR_MDIM_INDEX = 5; STRING = 6; } required SlotType type = 1; - required uint32 dim = 2; // For INDEX slots, this means the maximal index plus 1. + required uint32 dim = + 2; // For INDEX slots, this means the maximal index plus 1. }; message DataHeader { @@ -60,11 +66,11 @@ message DataHeader { }; message DataSample { - optional bool is_beginning = 1 [default = true]; // is the beginning of a sequence + optional bool is_beginning = 1 + [ default = true ]; // is the beginning of a sequence repeated VectorSlot vector_slots = 2; - repeated uint32 id_slots = 3 [packed = true]; + repeated uint32 id_slots = 3 [ packed = true ]; /* use ids of VectorSlot */ repeated VectorSlot var_id_slots = 4; repeated SubseqSlot subseq_slots = 5; }; - diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 83f72c137bdf5e55f28be908321bd2ccd6c906fe..4f3d5bf3f6cb96c97285f40e3a3d100c2af47ad5 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -21,7 +21,6 @@ package paddle; * Various structs for the configuration of a neural network */ - message ExternalConfig { repeated string layer_names = 1; repeated string input_layer_names = 2; @@ -68,7 +67,7 @@ message ConvConfig { required uint32 img_size = 8; // caffe mode for output size coherence - required bool caffe_mode = 9 [default = true]; + required bool caffe_mode = 9 [ default = true ]; // if filter_size_y is set , this convolutional layer will use // filters of size filter_size * filter_size_y pixels. @@ -99,7 +98,7 @@ message PoolConfig { optional uint32 start = 4; // Defines the stride size between successive pooling squares. - required uint32 stride = 5 [default = 1]; + required uint32 stride = 5 [ default = 1 ]; // The size of output feature map. required uint32 output_x = 6; @@ -109,7 +108,7 @@ message PoolConfig { // padding = 4, instructs the net to implicitly // pad the images with a 4-pixel border of zeros. - optional uint32 padding = 8 [default = 0]; + optional uint32 padding = 8 [ default = 0 ]; // if not set, use size_x optional uint32 size_y = 9; @@ -194,8 +193,11 @@ message MaxOutConfig { required uint32 groups = 2; } -message RowConvConfig { - required uint32 context_length = 1; +message RowConvConfig { required uint32 context_length = 1; } + +message SliceConfig { + required uint32 start = 1; + required uint32 end = 2; } message ProjectionConfig { @@ -207,17 +209,21 @@ message ProjectionConfig { // For ShiftProjection optional int32 context_start = 5; optional int32 context_length = 6; - optional bool trainable_padding = 7 [default = false]; + optional bool trainable_padding = 7 [ default = false ]; // For convolution optional ConvConfig conv_conf = 8; optional int32 num_filters = 9; // For IdentityOffsetProjection - optional uint64 offset = 11 [default = 0]; + optional uint64 offset = 11 [ default = 0 ]; // For pool optional PoolConfig pool_conf = 12; + + // For slice + // Each slice output is the input[start, end) + repeated SliceConfig slices = 13; } message OperatorConfig { @@ -227,7 +233,7 @@ message OperatorConfig { required uint64 output_size = 4; // For DotMulOperator - optional double dotmul_scale = 5 [default = 1.0]; + optional double dotmul_scale = 5 [ default = 1.0 ]; // For ConvOperator optional ConvConfig conv_conf = 6; @@ -273,8 +279,8 @@ message MultiBoxLossConfig { required float neg_overlap = 4; required uint32 background_id = 5; required uint32 input_num = 6; - optional uint32 height = 7 [default = 1]; - optional uint32 width = 8 [default = 1]; + optional uint32 height = 7 [ default = 1 ]; + optional uint32 width = 8 [ default = 1 ]; } message DetectionOutputConfig { @@ -285,8 +291,13 @@ message DetectionOutputConfig { required uint32 input_num = 5; required uint32 keep_top_k = 6; required float confidence_threshold = 7; - optional uint32 height = 8 [default = 1]; - optional uint32 width = 9 [default = 1]; + optional uint32 height = 8 [ default = 1 ]; + optional uint32 width = 9 [ default = 1 ]; +} + +message ClipConfig { + required double min = 1; + required double max = 2; } message LayerInputConfig { @@ -309,6 +320,7 @@ message LayerInputConfig { optional RowConvConfig row_conv_conf = 15; optional MultiBoxLossConfig multibox_loss_conf = 16; optional DetectionOutputConfig detection_output_conf = 17; + optional ClipConfig clip_conf = 18; } message LayerConfig { @@ -316,7 +328,7 @@ message LayerConfig { required string name = 1; required string type = 2; optional uint64 size = 3; - //optional ActivationConfig activation = 4; + // optional ActivationConfig activation = 4; optional string active_type = 4; repeated LayerInputConfig inputs = 5; optional string bias_parameter_name = 6; @@ -329,7 +341,7 @@ message LayerConfig { // (which is how convnets are usually trained). Setting this to // false will untie the biases, yielding a separate bias for // every location at which the filter is applied. - optional bool shared_biases = 8 [default = false]; + optional bool shared_biases = 8 [ default = false ]; // Valid values are ones that divide the area of the output // grid in this convolutional layer. For example if this layer @@ -347,33 +359,35 @@ message LayerConfig { // the gpu device which the Layer's data in. // Only used by ParallelNeuralNetork. Ignored otherwise. - optional int32 device = 12 [default = -1]; + optional int32 device = 12 [ default = -1 ]; - // for recurrent layer. If true, the recurrence runs from the end to the beginning. - optional bool reversed = 13 [default = false]; + // for recurrent layer. If true, the recurrence runs from the end to the + // beginning. + optional bool reversed = 13 [ default = false ]; - // for lstmemory layer. Different types of nodes have different activation type. - optional string active_gate_type = 14; + // for lstmemory layer. Different types of nodes have different activation + // type. + optional string active_gate_type = 14; optional string active_state_type = 15; // For NCELayer // The number of random negative labels for each sample - optional int32 num_neg_samples = 16 [default = 10]; + optional int32 num_neg_samples = 16 [ default = 10 ]; // For NCELayer // The distribution for generating the random negative labels. // A uniform distribution will be used if not provided - repeated double neg_sampling_dist = 17 [packed = true]; + repeated double neg_sampling_dist = 17 [ packed = true ]; // For MaxLayer // default: output VALUE of MaxLayer. set this flag to true for output INDEX // INDEX will be put in Argument::value as double values. - optional bool output_max_index = 19 [default = false]; + optional bool output_max_index = 19 [ default = false ]; /// The filed number 20 have been deprecated. // For self-normalized estimation - optional double softmax_selfnorm_alpha = 21 [default = 0.1]; + optional double softmax_selfnorm_alpha = 21 [ default = 0.1 ]; /// The filed numbers 22 and 23 have been deprecated. @@ -384,14 +398,14 @@ message LayerConfig { optional bool norm_by_times = 25; // for CostLayers - optional double coeff = 26 [default = 1.0]; + optional double coeff = 26 [ default = 1.0 ]; // for AverageLayer // can be set to: 'average', 'sum' or 'squarerootn' optional string average_strategy = 27; // for error clipping - optional double error_clipping_threshold = 28 [default = 0.0]; + optional double error_clipping_threshold = 28 [ default = 0.0 ]; // for operators used by mixed layer repeated OperatorConfig operator_confs = 29; @@ -419,43 +433,44 @@ message LayerConfig { optional uint32 beam_size = 39; // for seqlastins layer, whether select first instead last - optional bool select_first = 40 [default = false]; + optional bool select_first = 40 [ default = false ]; // for seqlastins layer, AverageLayer, MaxLayer and ExpandLayer // can be set to: 'non-seq','seq' - optional string trans_type = 41 [default = 'non-seq']; + optional string trans_type = 41 [ default = 'non-seq' ]; // to indicate whether selective_fc layer // is used in sequence generation or not - optional bool selective_fc_pass_generation = 42 [default = false]; + optional bool selective_fc_pass_generation = 42 [ default = false ]; // to indicate whether selective_fc layer take its last input to // selected several columns and only compute the multiplications // between the input matrices and the selected columns of // the parameter matrices of this layer. // if set false, selective_fc degrades into fc. - optional bool has_selected_colums = 43 [default = true]; + optional bool has_selected_colums = 43 [ default = true ]; // this parameter is for speed consideration. // if number of the selected columns is less than // sample number * selective_fc output size * selective_fc_mull_mull_ratio // sparse multiplication is used, otherwise, using full multiplication. - optional double selective_fc_full_mul_ratio = 44 [default = 0.02]; + optional double selective_fc_full_mul_ratio = 44 [ default = 0.02 ]; // to indicate how many threads selective_fc use to to accelate // the plain_mul period // leave empty or set to 0 to disable multi-thread accleleration - optional uint32 selective_fc_parallel_plain_mul_thread_num = 45 [default = 0]; + optional uint32 selective_fc_parallel_plain_mul_thread_num = 45 + [ default = 0 ]; // for batch normalization layer // if set use_global_stats true, will use the loaded mean and variance. optional bool use_global_stats = 46; // use to compute moving mean and variance. - optional double moving_average_fraction = 47 [default = 0.9]; + optional double moving_average_fraction = 47 [ default = 0.9 ]; // bias size - optional uint32 bias_size = 48 [default = 0]; + optional uint32 bias_size = 48 [ default = 0 ]; // this parameter can be used as a user-defined parameter when necessary, // without changing the proto file. @@ -470,18 +485,17 @@ message LayerConfig { optional uint64 width = 51; // blank label used in ctc loss - optional uint32 blank = 52 [default = 0]; + optional uint32 blank = 52 [ default = 0 ]; // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which // controls the scope of pooling operation. can be set > 0. // leave empty or set to -1 to disable this stride pooling. - optional int32 seq_pool_stride = 53 [default = -1]; + optional int32 seq_pool_stride = 53 [ default = -1 ]; // for crop layer - optional int32 axis = 54 [default = 2]; + optional int32 axis = 54 [ default = 2 ]; repeated uint32 offset = 55; repeated uint32 shape = 56; - } message EvaluatorConfig { @@ -497,9 +511,9 @@ message EvaluatorConfig { // Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator // For multi binary labels: true if output > classification_threshold - optional double classification_threshold = 6 [default = 0.5]; + optional double classification_threshold = 6 [ default = 0.5 ]; // The positive label. -1 means average precision and recall - optional int32 positive_label = 7 [default = -1]; + optional int32 positive_label = 7 [ default = -1 ]; // load dict from this file optional string dict_file = 8; @@ -508,10 +522,10 @@ message EvaluatorConfig { optional string result_file = 9; // top # results for max id printer - optional int32 num_results = 10 [default = 1]; + optional int32 num_results = 10 [ default = 1 ]; // whether to delimit the sequence in the seq_text_printer - optional bool delimited = 11 [default = true]; + optional bool delimited = 11 [ default = true ]; // Used by ChunkEvaluator // chunk of these types are not counted @@ -519,23 +533,23 @@ message EvaluatorConfig { // Used by ClassificationErrorEvaluator // top # classification error - optional int32 top_k = 13 [default = 1]; + optional int32 top_k = 13 [ default = 1 ]; // Used by DetectionMAPEvaluator - optional double overlap_threshold = 14 [default = 0.5]; + optional double overlap_threshold = 14 [ default = 0.5 ]; - optional int32 background_id = 15 [default = 0]; + optional int32 background_id = 15 [ default = 0 ]; - optional bool evaluate_difficult = 16 [default = false]; + optional bool evaluate_difficult = 16 [ default = false ]; - optional string ap_type = 17 [default = "11point"]; + optional string ap_type = 17 [ default = "11point" ]; } message LinkConfig { required string layer_name = 1; required string link_name = 2; // If true, this link has sub-sequence - optional bool has_subseq = 3 [default = false]; + optional bool has_subseq = 3 [ default = false ]; } message MemoryConfig { @@ -548,18 +562,18 @@ message MemoryConfig { optional uint32 boot_with_const_id = 7; // memory is a sequence, initailized by a sequence boot layer - optional bool is_sequence = 6 [default = false]; + optional bool is_sequence = 6 [ default = false ]; } message GeneratorConfig { required uint32 max_num_frames = 1; required string eos_layer_name = 2; - optional int32 num_results_per_sample = 3 [default = 1]; + optional int32 num_results_per_sample = 3 [ default = 1 ]; // for beam search - optional int32 beam_size = 4 [default = 1]; + optional int32 beam_size = 4 [ default = 1 ]; - optional bool log_prob = 5 [default = true]; + optional bool log_prob = 5 [ default = true ]; } message SubModelConfig { @@ -569,10 +583,10 @@ message SubModelConfig { repeated string output_layer_names = 4; repeated string evaluator_names = 5; - optional bool is_recurrent_layer_group = 6 [default = false]; + optional bool is_recurrent_layer_group = 6 [ default = false ]; // If true, the recurrence runs from the end to the beginning. - optional bool reversed = 7 [default = false]; + optional bool reversed = 7 [ default = false ]; // name and link name of memory repeated MemoryConfig memories = 8; @@ -586,14 +600,15 @@ message SubModelConfig { optional GeneratorConfig generator = 11; - // the id of inlink which share info with outlinks, used in recurrent layer group + // the id of inlink which share info with outlinks, used in recurrent layer + // group optional int32 target_inlinkid = 12; } message ModelConfig { // type of the model. // Currently, "nn", "recurrent_nn" and "recursive_nn" are supported - required string type = 1 [default = "nn"]; + required string type = 1 [ default = "nn" ]; // layers should be ordered in such a way that the forward propagation // can be correctly executed by going from the first layer to the last layer diff --git a/proto/OptimizerConfig.proto b/proto/OptimizerConfig.proto index 2a87e293f64d3398dea2641c3ff292eceec7e154..d27b1bcf80045216a5807812d39f7a248a956076 100644 --- a/proto/OptimizerConfig.proto +++ b/proto/OptimizerConfig.proto @@ -1,5 +1,5 @@ syntax = "proto2"; - + option optimize_for = LITE_RUNTIME; package paddle; @@ -9,13 +9,11 @@ message SGDConfig { // momentum: float >= 0. Parameter updates momentum. // decay: float >= 0. Learning rate decay over each update. // nesterov: boolean. Whether to apply Nesterov momentum. - optional double momentum = 21 [default = 0.0]; - optional double decay = 23 [default = 0.0]; - optional bool nesterov =24 [default = false]; - + optional double momentum = 21 [ default = 0.0 ]; + optional double decay = 23 [ default = 0.0 ]; + optional bool nesterov = 24 [ default = false ]; } - message AdadeltaConfig { // Adadelta // It is recommended to leave it at the default value. @@ -23,21 +21,23 @@ message AdadeltaConfig { // epsilon: float >= 0. Fuzz factor. // decay: float >= 0. Learning rate decay over each update. - // reference : [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701) - optional double rho = 33 [default = 0.90]; - optional double epsilon = 31 [default = 1e-5]; - optional double decay = 32 [default = 0.0]; - + // reference : [Adadelta - an adaptive learning rate + // method](http://arxiv.org/abs/1212.5701) + optional double rho = 33 [ default = 0.90 ]; + optional double epsilon = 31 [ default = 1e-5 ]; + optional double decay = 32 [ default = 0.0 ]; } message AdagradConfig { -// Adagrad -// epsilon: float >= 0. -// decay: float >= 0. Learning rate decay over each update. + // Adagrad + // epsilon: float >= 0. + // decay: float >= 0. Learning rate decay over each update. -// reference : [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) - optional double epsilon = 41 [default = 1e-5]; - optional double decay = 42 [default = 0.0]; + // reference : [Adaptive Subgradient Methods for Online Learning and + // Stochastic + // Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + optional double epsilon = 41 [ default = 1e-5 ]; + optional double decay = 42 [ default = 0.0 ]; } message AdamConfig { @@ -46,7 +46,8 @@ message AdamConfig { // beta_2: float, 0 < beta < 1. Generally close to 1. // epsilon: float >= 0. Fuzz factor. // decay: float >= 0. Learning rate decay over each update. - // reference : [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8) + // reference : [Adam - A Method for Stochastic + // Optimization](http://arxiv.org/abs/1412.6980v8) optional double beta_1 = 41; optional double beta_2 = 42; optional double epsilon = 43; @@ -55,32 +56,32 @@ message AdamConfig { message ConstLrConfig { // learninRate Policy - optional double learning_rate = 1 [default = 1.0]; + optional double learning_rate = 1 [ default = 1.0 ]; } message LinearLrConfig { // learninRate Policy - optional double learning_rate = 1 [default = 1.0]; + optional double learning_rate = 1 [ default = 1.0 ]; optional double lr_decay_a = 2; optional double lr_decay_b = 3; } message TensorProto { -enum DataType { - PADDLE_ELEMENT_TYPE_INT32 = 0; - PADDLE_ELEMENT_TYPE_UINT32 = 1; - PADDLE_ELEMENT_TYPE_INT64 = 2; - PADDLE_ELEMENT_TYPE_UINT64 = 3; - PADDLE_ELEMENT_TYPE_FLOAT32 = 4; - PADDLE_ELEMENT_TYPE_FLOAT64 = 5; -} + enum DataType { + PADDLE_ELEMENT_TYPE_INT32 = 0; + PADDLE_ELEMENT_TYPE_UINT32 = 1; + PADDLE_ELEMENT_TYPE_INT64 = 2; + PADDLE_ELEMENT_TYPE_UINT64 = 3; + PADDLE_ELEMENT_TYPE_FLOAT32 = 4; + PADDLE_ELEMENT_TYPE_FLOAT64 = 5; + } optional DataType data_type = 1; repeated bytes content = 2; } message LrPolicyState { // learninRate Policy - optional double learning_rate = 1 [default = 1.0]; + optional double learning_rate = 1 [ default = 1.0 ]; optional double lr_decay_a = 2; optional double lr_decay_b = 3; } @@ -104,7 +105,6 @@ message AdadeltaOptimizerState { optional TensorProto update_delta = 4; } - message AdagradOptimizerState { optional LrPolicyState lr_state = 101; optional double num_sample_passed = 104; @@ -124,10 +124,10 @@ message AdamOptimizerState { message OptimizerConfig { enum Optimizer { - SGD = 1; - Adadelta = 2; - Adagrad = 3; - Adam = 4; + SGD = 1; + Adadelta = 2; + Adagrad = 3; + Adam = 4; } optional Optimizer optimizer = 1; optional SGDConfig sgd = 3; @@ -136,8 +136,8 @@ message OptimizerConfig { optional AdamConfig adam = 6; enum LrPolicy { - Const = 0; - Linear = 1; + Const = 0; + Linear = 1; } optional LrPolicy lr_policy = 11; optional ConstLrConfig const_lr = 12; diff --git a/proto/ParameterConfig.proto b/proto/ParameterConfig.proto index 580d66324602df4c655dd2f1e1cd87159b5b346b..b13570a2c6e7b16e45892a31bb496a9dd2099df0 100644 --- a/proto/ParameterConfig.proto +++ b/proto/ParameterConfig.proto @@ -27,56 +27,57 @@ enum ParameterInitStrategy { message ParameterUpdaterHookConfig { // hook type such as 'pruning' required string type = 1; - // this represents the ratio of zero element to be set by the Parameter - optional double sparsity_ratio = 2 [default = 0.6]; + // this represents the ratio of zero element to be set by the Parameter + optional double sparsity_ratio = 2 [ default = 0.6 ]; } message ParameterConfig { required string name = 1; required uint64 size = 2; - optional double learning_rate = 3 [default = 1.0]; - optional double momentum = 4 [default = 0.0]; - optional double initial_mean = 5 [default = 0.0]; - optional double initial_std = 6 [default = 0.01]; + optional double learning_rate = 3 [ default = 1.0 ]; + optional double momentum = 4 [ default = 0.0 ]; + optional double initial_mean = 5 [ default = 0.0 ]; + optional double initial_std = 6 [ default = 0.01 ]; // use L2-regularization if decay_rate set and decay_rate_l1 not set - optional double decay_rate = 7 [default = 0.0]; + optional double decay_rate = 7 [ default = 0.0 ]; // use L1-regularization if decay_rate_l1 set - optional double decay_rate_l1 = 8 [default = 0.0]; + optional double decay_rate_l1 = 8 [ default = 0.0 ]; // dims of Parameter, e.g. dims[0] as height, dims[1] as width.. repeated uint64 dims = 9; // the gpu device which the parameter in. // Only used by ParallelNeuralNetork. Ignored otherwise. - optional int32 device = 10 [default = -1]; + optional int32 device = 10 [ default = -1 ]; // how to init the parameter: 0 -> normal, 1 -> uniform // 0: treat initial_mean as mean, intial_std as standard deviation // 1: range is (initial_mean - initial_std) to (initial_mean + initial_std) - optional int32 initial_strategy = 11 [default = 0]; + optional int32 initial_strategy = 11 [ default = 0 ]; // define the variance when init the parameter, by height of the Matrix - optional bool initial_smart = 12 [default = false]; + optional bool initial_smart = 12 [ default = false ]; // apply regularization every # batches - optional int32 num_batches_regularization = 13 [default = 1]; + optional int32 num_batches_regularization = 13 [ default = 1 ]; // if is_sparse is true, para is sparse, else para is dense - optional bool is_sparse = 14[default = false]; - // if para is sparse, format should be "csc" or "csr", empty means is not sparse - optional string format = 15 [default = ""]; + optional bool is_sparse = 14 [ default = false ]; + // if para is sparse, format should be "csc" or "csr", empty means is not + // sparse + optional string format = 15 [ default = "" ]; // sparse remote update or not - optional bool sparse_remote_update = 16 [default = false]; + optional bool sparse_remote_update = 16 [ default = false ]; // gradient clipping threshold, no clipping by default - optional double gradient_clipping_threshold = 17 [default = 0.0]; + optional double gradient_clipping_threshold = 17 [ default = 0.0 ]; // static parameters are fixed when training - optional bool is_static = 18 [default = false]; + optional bool is_static = 18 [ default = false ]; // para_id should NOT be set by config_parser. It is for // internal use. optional uint64 para_id = 19; repeated ParameterUpdaterHookConfig update_hooks = 20; // setup load mat -> csr - optional bool need_compact = 21 [default = false]; + optional bool need_compact = 21 [ default = false ]; // whether to do sparse update for this parameter - optional bool sparse_update = 22 [default = false]; + optional bool sparse_update = 22 [ default = false ]; // whether this parameter is shared or not. - optional bool is_shared = 23 [default = false]; + optional bool is_shared = 23 [ default = false ]; // parameter block size - optional uint64 parameter_block_size = 24 [default = 0]; + optional uint64 parameter_block_size = 24 [ default = 0 ]; } diff --git a/proto/ParameterServerConfig.proto b/proto/ParameterServerConfig.proto index 404f9613792653dda72eeb98f022851adedbfbfd..bd63cf35b1483a45f21de6f0d0d883e4d8432296 100644 --- a/proto/ParameterServerConfig.proto +++ b/proto/ParameterServerConfig.proto @@ -15,13 +15,10 @@ syntax = "proto2"; package paddle; - /** * Configuration structure for ParameterClient2. */ -message ParameterClientConfig { - required int32 trainer_id = 1; -} +message ParameterClientConfig { required int32 trainer_id = 1; } /** * Configuration structure for ParameterServer2. @@ -30,24 +27,24 @@ message ParameterServerConfig { // Number of ports for sending dense parameter, // following ports on parameter server will be visited // for sending dense parameter: [port, port+ports_num-1] - required int32 ports_num = 1 [default = 1]; + required int32 ports_num = 1 [ default = 1 ]; // Number of ports for sending sparse parameter, // following ports on parameter server will be visited // for sending sparse parameter: // [port+ports_num, port+ports_num+ports_num_for_sparse-1] - required int32 ports_num_for_sparse = 2 [default = 0]; + required int32 ports_num_for_sparse = 2 [ default = 0 ]; // network device name for pservers - required string nics = 3 [default = "xgbe0,xgbe1"]; - required string rdma_tcp = 4 [default = "tcp"]; + required string nics = 3 [ default = "xgbe0,xgbe1" ]; + required string rdma_tcp = 4 [ default = "tcp" ]; // Listening port for pserver - required int32 port = 5 [default = 20134]; + required int32 port = 5 [ default = 20134 ]; // number of gradient servers - required int32 num_gradient_servers = 6 [default = 1]; + required int32 num_gradient_servers = 6 [ default = 1 ]; // number of threads for sync op exec - required int32 pserver_num_threads = 7 [default = 1]; + required int32 pserver_num_threads = 7 [ default = 1 ]; // control config_.async_lagged_grad_discard_ratio() min value - required double async_lagged_ratio_min = 8 [default = 1.0]; + required double async_lagged_ratio_min = 8 [ default = 1.0 ]; // if async_lagged_grad_discard_ratio is not set in trainer_config.conf // use it as defalut value - required double async_lagged_ratio_default = 9 [default = 1.5]; + required double async_lagged_ratio_default = 9 [ default = 1.5 ]; } \ No newline at end of file diff --git a/proto/ParameterService.proto b/proto/ParameterService.proto index c1c04d8cc5bdedd09173d5dfa10b82c7ee7ed6a4..e3c180ccc3f2a9bfa13c443944cc5ae3398818a9 100644 --- a/proto/ParameterService.proto +++ b/proto/ParameterService.proto @@ -23,8 +23,8 @@ package paddle; */ enum ParameterUpdateMode { // Set parameter - PSERVER_UPDATE_MODE_SET_PARAM = 0;//use local param - PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1;//set zero param + PSERVER_UPDATE_MODE_SET_PARAM = 0; // use local param + PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1; // set zero param // Update parameter once a gradient is received PSERVER_UPDATE_MODE_ASYNC_SGD = 2; @@ -37,7 +37,7 @@ enum ParameterUpdateMode { // No update. Only get parameters back. PSERVER_UPDATE_MODE_GET_PARAM = 5; - PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6;//only get sparse rows + PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6; // only get sparse rows }; message ParameterBlock { @@ -80,42 +80,34 @@ message SendParameterRequest { optional int32 trainer_id = 7; // send back parameter type on pserver, PARAMETER_VALUE by default - optional int32 send_back_parameter_type = 8 [default = 0]; + optional int32 send_back_parameter_type = 8 [ default = 0 ]; // forwardbackward time in usec optional uint64 forwardbackward_time = 9; - } -message WaitPassStartRequest { -} +message WaitPassStartRequest {} -message WaitPassStartResponse { -} +message WaitPassStartResponse {} -message WaitPassFinishRequest { -} +message WaitPassFinishRequest {} -message WaitPassFinishResponse { -} +message WaitPassFinishResponse {} enum SyncObject { SYNC_DEFAULT = 0; // wait for the synchronizeBarrier_ - SYNC_DATA = 1; // wait for the synchronizeDataBarrier_ + SYNC_DATA = 1; // wait for the synchronizeDataBarrier_ } message SynchronizeRequest { - required SyncObject sync_object_id = 1 [default = SYNC_DEFAULT]; + required SyncObject sync_object_id = 1 [ default = SYNC_DEFAULT ]; optional int32 trainer_id = 2; } -message SynchronizeResponse { -} +message SynchronizeResponse {} -message SendParameterResponse { - repeated ParameterBlock blocks = 1; -} +message SendParameterResponse { repeated ParameterBlock blocks = 1; } message SetConfigRequest { repeated ParameterConfig param_configs = 1; @@ -125,26 +117,18 @@ message SetConfigRequest { required bool is_sparse_server = 6; } -message SetConfigResponse{ -} +message SetConfigResponse {} -message GetStatusRequest { -} +message GetStatusRequest {} -message GetStatusResponse { - required PServerStatus status = 1; -} +message GetStatusResponse { required PServerStatus status = 1; } -message SetStatusRequest { - required PServerStatus status = 1; -} +message SetStatusRequest { required PServerStatus status = 1; } -message SetStatusResponse { -} +message SetStatusResponse {} // create a column vector. The size is the dimension of parameter -message CreateVectorRequest { -} +message CreateVectorRequest {} message CreateVectorResponse { // error message. Empty if success @@ -153,9 +137,7 @@ message CreateVectorResponse { required int64 handle = 2; } -message ReleaseVectorRequest { - required int64 handle = 1; -} +message ReleaseVectorRequest { required int64 handle = 1; } message ReleaseVectorResponse { // error message. Empty if success @@ -164,9 +146,7 @@ message ReleaseVectorResponse { // Create a column major matrix. The number of rows is the dimension // of parameter. The number of columns is specifed by num_cols -message CreateMatrixRequest { - required int32 num_cols = 1; -} +message CreateMatrixRequest { required int32 num_cols = 1; } message CreateMatrixResponse { // error message. Empty if success @@ -175,16 +155,13 @@ message CreateMatrixResponse { required int64 handle = 2; } -message ReleaseMatrixRequest { - required int64 handle = 1; -} +message ReleaseMatrixRequest { required int64 handle = 1; } message ReleaseMatrixResponse { // error message. Empty if success optional string return_message = 1; } - /** * The operations are defined using the variables commented at Operation * and OperationResult @@ -245,36 +222,36 @@ enum MatrixVectorOperation { message ProtoVector { required int64 dim = 1; - repeated double values = 2 [packed = true]; + repeated double values = 2 [ packed = true ]; } message ProtoMatrix { required int64 num_rows = 1; required int64 num_cols = 2; - repeated double values = 3 [packed = true]; + repeated double values = 3 [ packed = true ]; } message Operation { required MatrixVectorOperation operation = 1; // vector handles created on the pserver - repeated int64 pvectors = 2; // u, v, w + repeated int64 pvectors = 2; // u, v, w // matrix handles created on the pserver - repeated int64 pmatrices = 3; // A, B, C + repeated int64 pmatrices = 3; // A, B, C - repeated double scalars = 4; // a, b, c - repeated ProtoVector vectors = 5; // x, y, z - repeated ProtoMatrix matrices = 6; // X, Y, Z + repeated double scalars = 4; // a, b, c + repeated ProtoVector vectors = 5; // x, y, z + repeated ProtoMatrix matrices = 6; // X, Y, Z } message OperationResult { // error message. Empty if success optional string return_message = 1; -// - repeated double scalars = 2; // d, e, f + // + repeated double scalars = 2; // d, e, f repeated ProtoVector vectors = 3; // p, q, r - repeated ProtoMatrix matrices = 4; // P, Q, R + repeated ProtoMatrix matrices = 4; // P, Q, R } message DoOperationRequest { @@ -301,18 +278,14 @@ message DoOperationResponse { required bool pass_finish = 3; } -message LoadValueRequest { - required string dir_name = 1; -} +message LoadValueRequest { required string dir_name = 1; } message LoadValueResponse { // error message. Empty if success optional string return_message = 1; } -message SaveValueRequest { - required string dir_name = 1; -} +message SaveValueRequest { required string dir_name = 1; } message SaveValueResponse { // error message. Empty if success @@ -331,11 +304,11 @@ enum DataUpdateMode { // Client send it's own ref label to pserver DATA_UPDATE_MODE_SET_REF_LABEL = 4; // Client get all ref labels from all pservers - DATA_UPDATE_MODE_GET_REF_LABEL =5; + DATA_UPDATE_MODE_GET_REF_LABEL = 5; // Client send it's own ref grad to pserver - DATA_UPDATE_MODE_SET_REF_GRAD =6; + DATA_UPDATE_MODE_SET_REF_GRAD = 6; // Client get all ref grad from all pservers - DATA_UPDATE_MODE_GET_REF_GRAD =7; + DATA_UPDATE_MODE_GET_REF_GRAD = 7; } enum SendDataType { @@ -360,7 +333,7 @@ message DataBlock { // byte size of one data type required int32 data_size = 2; // data_type - optional TransDataType data_type = 3 [default = TRANS_DOUBLE]; + optional TransDataType data_type = 3 [ default = TRANS_DOUBLE ]; } message SendDataRequest { diff --git a/proto/TrainerConfig.proto b/proto/TrainerConfig.proto index a819d20d11ff3932d331801007b8cfb9c77a3f2b..b7c2355159e66be0a1550d3c8fde9a15346ff7e4 100644 --- a/proto/TrainerConfig.proto +++ b/proto/TrainerConfig.proto @@ -20,14 +20,14 @@ package paddle; message OptimizationConfig { required int32 batch_size = 3; - required string algorithm = 4 [default = "async_sgd"]; - optional int32 num_batches_per_send_parameter = 5 [default = 1]; - optional int32 num_batches_per_get_parameter = 6 [default = 1]; + required string algorithm = 4 [ default = "async_sgd" ]; + optional int32 num_batches_per_send_parameter = 5 [ default = 1 ]; + optional int32 num_batches_per_get_parameter = 6 [ default = 1 ]; required double learning_rate = 7; - optional double learning_rate_decay_a = 8 [default = 0]; - optional double learning_rate_decay_b = 9 [default = 0]; - optional string learning_rate_schedule = 27 [default = "constant"]; + optional double learning_rate_decay_a = 8 [ default = 0 ]; + optional double learning_rate_decay_b = 9 [ default = 0 ]; + optional string learning_rate_schedule = 27 [ default = "constant" ]; // learning rate will be scaled according to learning_rate_schedule // 1), constant: // lr = learning_rate @@ -49,88 +49,92 @@ message OptimizationConfig { // owlqn related // L1-regularization - optional double l1weight = 10 [default = 0.1]; + optional double l1weight = 10 [ default = 0.1 ]; // L2-regularization - optional double l2weight = 11 [default = 0]; + optional double l2weight = 11 [ default = 0 ]; // "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step) // then accept the step - optional double c1 = 12 [default = 0.0001]; + optional double c1 = 12 [ default = 0.0001 ]; // multiply the step with "backoff", when wolfe condition doesn't satisfy - optional double backoff = 13 [default = 0.5]; + optional double backoff = 13 [ default = 0.5 ]; // how many "s"s and "y"s are kept in owlqn - optional int32 owlqn_steps = 14 [default = 10]; + optional int32 owlqn_steps = 14 [ default = 10 ]; // accept the step if encountered "max_backoff" times of "reduce the step" - optional int32 max_backoff = 15 [default = 5]; + optional int32 max_backoff = 15 [ default = 5 ]; // L2-regularization coefficient is reduced linearly from iteration 0 to // "l2weight_zero_iter", and set to 0 after "l2weight_zero_iter" // iterations. set "l2weight_zero_iter" to 0 to disable this strategy. - optional int32 l2weight_zero_iter = 17 [default = 0]; + optional int32 l2weight_zero_iter = 17 [ default = 0 ]; // averaged sgd // About average_window * numBatchProcessed parameter are used // for average. To be accurate, between average_window * numBatchProcessed // and 2 * average_window * numBatchProcessed parameters are used for // average. - optional double average_window = 18 [default = 0]; - optional int64 max_average_window = 19 [default = 0x7fffffffffffffff]; + optional double average_window = 18 [ default = 0 ]; + optional int64 max_average_window = 19 [ default = 0x7fffffffffffffff ]; ////////////////////////// // Options Adaptive SGD // ////////////////////////// - // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta", "rmsprop" - // default learning method("momentum") use global decayed learning rate with momentum. + // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta", + // "rmsprop" + // default learning method("momentum") use global decayed learning rate with + // momentum. // "adagrad", "adadelta" and "rmsprop" can set momentum too. - optional string learning_method = 23 [default = "momentum"]; - optional double ada_epsilon = 24 [default = 1e-6]; - optional double ada_rou = 26 [default = 0.95]; + optional string learning_method = 23 [ default = "momentum" ]; + optional double ada_epsilon = 24 [ default = 1e-6 ]; + optional double ada_rou = 26 [ default = 0.95 ]; // Force to do average in cpu in order to save gpu memory usage - optional bool do_average_in_cpu = 25 [default = false]; + optional bool do_average_in_cpu = 25 [ default = false ]; // delta add rate in pserver, used while num_batches_per_send_parameter>1 // will be divided by #machines automatically. - optional double delta_add_rate = 28 [default = 1.0]; + optional double delta_add_rate = 28 [ default = 1.0 ]; // We split a large size into smaller mini-batches, whose sizes are // determined by mini_batch_size. It only takes effect when there is // an ExternalMachine. - optional int32 mini_batch_size = 29 [default = 128]; + optional int32 mini_batch_size = 29 [ default = 128 ]; // automatically set if any one of parameters set sparse remote update flag - optional bool use_sparse_remote_updater = 30 [default = false]; + optional bool use_sparse_remote_updater = 30 [ default = false ]; - // how to update center parameter and feedback to local parameter, + // how to update center parameter and feedback to local parameter, // when use local sgd update in cluster training. - // A option is elastic_average, proposed by the paper: Deep learning with elastic averaging SGD. - // If use elastic_average method, every trainer node should sample from whole data sets. - optional string center_parameter_update_method = 31 [default = "average"]; + // A option is elastic_average, proposed by the paper: Deep learning with + // elastic averaging SGD. + // If use elastic_average method, every trainer node should sample from whole + // data sets. + optional string center_parameter_update_method = 31 [ default = "average" ]; // shrink sparse parameter value // only works if parameter is remote sparse update and has L1 decay rate - optional double shrink_parameter_value = 32 [default = 0]; + optional double shrink_parameter_value = 32 [ default = 0 ]; //////////////////////////// // Options Adam Optimizer // //////////////////////////// - optional double adam_beta1 = 33 [default = 0.9]; - optional double adam_beta2 = 34 [default = 0.999]; - optional double adam_epsilon = 35 [default = 1e-8]; + optional double adam_beta1 = 33 [ default = 0.9 ]; + optional double adam_beta2 = 34 [ default = 0.999 ]; + optional double adam_epsilon = 35 [ default = 1e-8 ]; // arguments for learning rate scheduler // Format: num1:rate1,num2:rate2,...,numK:rateK // For learning_rate_schedule="manual", num is the number of samples, // For learning_rate_schedule="pass_manual", // num is the number of passes (starting from 0) - optional string learning_rate_args = 36 [default = ""]; - + optional string learning_rate_args = 36 [ default = "" ]; + // for async sgd gradient commit control. // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed, // current async gradient will be discard silently. - optional double async_lagged_grad_discard_ratio = 37 [default = 1.5]; + optional double async_lagged_grad_discard_ratio = 37 [ default = 1.5 ]; - // global threshold for gradient clipping - optional double gradient_clipping_threshold = 38 [default = 0.0]; + // global threshold for gradient clipping + optional double gradient_clipping_threshold = 38 [ default = 0.0 ]; }; message TrainerConfig { @@ -141,7 +145,7 @@ message TrainerConfig { repeated string config_files = 5; // the directory to save/load model files for each training path - optional string save_dir = 6 [default = "./output/model"]; + optional string save_dir = 6 [ default = "./output/model" ]; // Path of the initial model parameters. // If it was set, start_pass will be ignored. @@ -149,7 +153,7 @@ message TrainerConfig { // Start training from this pass. // Will load parameter from the previous pass. - optional int32 start_pass = 8 [default = 0]; + optional int32 start_pass = 8 [ default = 0 ]; // file path to the trainer config file optional string config_file = 9; diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 5477158ecb8646992ebdded0b15cce50720ebf36..9ea69fc5e57636c22fb20d5d97de760b9cc3bcde 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -565,6 +565,35 @@ class IdentityOffsetProjection(Projection): return [] +@config_class +class SliceProjection(Projection): + type = 'slice' + + def __init__(self, input_layer_name, slices, **xargs): + super(SliceProjection, self).__init__(input_layer_name, **xargs) + input = g_layer_map[input_layer_name] + if input.type in ["exconv", "cudnn_conv"]: + # the slice operator is for the channel dimension + assert input.num_filters is not None + channels = input.num_filters + image_size = input.size / channels + assert slices[len(slices) - 1][1] <= channels + for i in xrange(len(slices)): + slice = self.proj_conf.slices.add() + slice.start = slices[i][0] * image_size + slice.end = slices[i][1] * image_size + self.size += slice.end - slice.start + else: + config_assert(False, + 'Currently the input should be convolution layer') + + def calc_parameter_size(self, input_size, output_size): + return 0 + + def calc_parameter_dims(self, input_size, output_size): + return [] + + # DotMulProjection performs element-wise multiplication with weight @config_class class DotMulProjection(Projection): @@ -2169,6 +2198,20 @@ class RowConvLayer(LayerBase): self.create_input_parameter(0, psize, dims) +@config_layer('clip') +class ClipLayer(LayerBase): + def __init__(self, name, inputs, min, max, **xargs): + super(ClipLayer, self).__init__(name, 'clip', 0, inputs=inputs, **xargs) + config_assert( + len(self.inputs) == 1, + 'ClipLayer must have one and only one input.') + config_assert(min < max, 'min must be less than max.') + input_layer = self.get_input_layer(0) + self.set_layer_size(input_layer.size) + self.config.inputs[0].clip_conf.min = min + self.config.inputs[0].clip_conf.max = max + + # key: cost type # value: cost class g_cost_map = {} @@ -2725,6 +2768,16 @@ class SumToOneNormLayer(LayerBase): self.set_layer_size(input_layer0.size) +@config_layer('row_l2_norm') +class RowL2NormLayer(LayerBase): + def __init__(self, name, inputs, **xargs): + super(RowL2NormLayer, self).__init__( + name, 'row_l2_norm', 0, inputs=inputs, **xargs) + config_assert(len(self.inputs) == 1, 'RowL2NormLayer must have 1 input') + input_layer = self.get_input_layer(0) + self.set_layer_size(input_layer.size) + + @config_layer('cos_vm') class CosSimVecMatLayer(LayerBase): def __init__(self, name, size, inputs, cos_scale=1.0, device=None): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 14f072fc55109d770edf469ad7c574b8dda8a434..ea5fdcc50f6abbc67fb61b7fd56c100d9f9811d0 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -76,6 +76,7 @@ __all__ = [ 'trans_layer', 'rotate_layer', 'sum_to_one_norm_layer', + 'row_l2_norm_layer', 'get_output_layer', 'LayerType', 'context_projection', @@ -128,6 +129,8 @@ __all__ = [ 'prelu_layer', 'gated_unit_layer', 'crop_layer', + 'clip_layer', + 'slice_projection', ] @@ -159,6 +162,7 @@ class LayerType(object): BATCH_NORM_LAYER = 'batch_norm' NORM_LAYER = 'norm' SUM_TO_ONE_NORM_LAYER = 'sum_to_one_norm' + ROW_L2_NORM_LAYER = 'row_l2_norm' ADDTO_LAYER = 'addto' CONCAT_LAYER = 'concat' @@ -220,6 +224,7 @@ class LayerType(object): PRELU = 'prelu' CROP_LAYER = 'crop' + CLIP_LAYER = 'clip' @staticmethod def is_layer_type(type_name): @@ -536,6 +541,45 @@ def identity_projection(input, offset=None, size=None): return proj +def slice_projection(input, slices): + """ + slice_projection can slice the input value into multiple parts, + and then select some of them to merge into a new output. + + .. math:: + output = [input.slices()] + + The example usage is: + + .. code-block:: python + + proj = slice_projection(input=layer, slices=[(0, 10), (20, 30)]) + + Note that slice_projection should not have any parameter. + + :param input: Input Layer. + :type input: LayerOutput + :param slices: An array of slice parameters. + Each slice contains the start and end offsets based + on the input. + :type slices: pair of int + :return: A SliceProjection object + :rtype: SliceProjection + """ + assert len(slices) >= 1 + start = 0 + for i in xrange(len(slices)): + assert len(slices[i]) == 2 + # The start position of the next slice needs to be greater than + # or equal to the end position of the previous slice. + assert slices[i][0] >= start + assert slices[i][1] >= slices[i][0] + start = slices[i][1] + proj = SliceProjection(input_layer_name=input.name, slices=slices) + proj.origin = input + return proj + + @wrap_param_attr_default() def scaling_projection(input, param_attr=None): """ @@ -2849,6 +2893,42 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None): name, LayerType.SUM_TO_ONE_NORM_LAYER, parents=[input], size=input.size) +@wrap_name_default() +@layer_support() +def row_l2_norm_layer(input, name=None, layer_attr=None): + """ + A layer for L2-normalization in each row. + + .. math:: + out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}} + + where the size of :math:`in` is (batchSize x dataDim) , + and the size of :math:`out` is a (batchSize x dataDim) . + + The example usage is: + + .. code-block:: python + + row_l2_norm_layer = row_l2_norm_layer(input=layer) + + :param input: Input layer. + :type input: LayerOutput + :param name: Layer name. + :type name: basestring + :param layer_attr: extra layer attributes. + :type layer_attr: ExtraLayerAttribute. + :return: LayerOutput object. + :rtype: LayerOutput + """ + Layer( + name=name, + type=LayerType.ROW_L2_NORM_LAYER, + inputs=[input.name], + **ExtraAttr.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.ROW_L2_NORM_LAYER, parents=[input], size=input.size) + + @wrap_name_default("addto") @wrap_act_default(act=LinearActivation()) @wrap_bias_attr_default(has_bias=False) @@ -6006,3 +6086,36 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None): layer_type=LayerType.CROP_LAYER, parents=input, size=l.config.size) + + +@wrap_name_default("clip") +def clip_layer(input, min, max, name=None): + """ + A layer for clipping the input value by the threshold. + + .. math:: + + out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right) + + .. code-block:: python + + clip = clip_layer(input=input_layer, min=-10, max=10) + + :param name: The Layer Name. + :type name: basestring + :param input: The input layer. + :type input: LayerOutput. + :param min: The lower threshold for clipping. + :type min: double + :param max: The upper threshold for clipping. + :type max: double + :return: LayerOutput + """ + Layer( + name=name, + type=LayerType.CLIP_LAYER, + inputs=[input.name], + min=min, + max=max) + return LayerOutput( + name, LayerType.CLIP_LAYER, parents=[input], size=input.size) diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index cdf9b2eab733adb173cf33cd6a93ef7b5abefc50..0ffa58bc1e2088f75e7cd25c7ecdffbe270825a4 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -7,6 +7,6 @@ test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer -test_recursive_topology test_gated_unit_layer) +test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr new file mode 100644 index 0000000000000000000000000000000000000000..4b9578a0c050ef74f186485fec3f6c1f7a0f0814 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr @@ -0,0 +1,31 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 300 + active_type: "" +} +layers { + name: "__clip_0__" + type: "clip" + size: 300 + active_type: "" + inputs { + input_layer_name: "input" + clip_conf { + min: -10 + max: 10 + } + } +} +input_layer_names: "input" +output_layer_names: "__clip_0__" +sub_models { + name: "root" + layer_names: "input" + layer_names: "__clip_0__" + input_layer_names: "input" + output_layer_names: "__clip_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr new file mode 100644 index 0000000000000000000000000000000000000000..c2786ff55c7023d856d739face5e747cc5fee870 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr @@ -0,0 +1,27 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 300 + active_type: "" +} +layers { + name: "__row_l2_norm_layer_0__" + type: "row_l2_norm" + size: 300 + active_type: "" + inputs { + input_layer_name: "input" + } +} +input_layer_names: "input" +output_layer_names: "__row_l2_norm_layer_0__" +sub_models { + name: "root" + layer_names: "input" + layer_names: "__row_l2_norm_layer_0__" + input_layer_names: "input" + output_layer_names: "__row_l2_norm_layer_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..f066fe1fb30877bf40bb6299d35546f7427989a5 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py @@ -0,0 +1,6 @@ +from paddle.trainer_config_helpers import * + +data = data_layer(name='input', size=300) +clip = clip_layer(input=data, min=-10, max=10) + +outputs(clip) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..ac8badb26a40e96e75225e6f61aa536cd28e9098 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py @@ -0,0 +1,6 @@ +from paddle.trainer_config_helpers import * + +data = data_layer(name='input', size=300) +row_l2_norm = row_l2_norm_layer(input=data) + +outputs(row_l2_norm) diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index f885b2834e8ad502b752c6fd53daf7ef1693433f..0a2a1ced11ee5cb2fb407b229ce810d553c2fa46 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -133,7 +133,7 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train100(), 10, "cifar_train100") - paddle.v2.dataset.common.convert(path, test100(), 10, "cifar_test100") - paddle.v2.dataset.common.convert(path, train10(), 10, "cifar_train10") - paddle.v2.dataset.common.convert(path, test10(), 10, "cifar_test10") + paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100") + paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100") + paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10") + paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10") diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index 111496618dfa997246d0a067b0cd4c7dad74f9dc..053ae151c571e5557c9f2f9f4ec866f546a77797 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -32,17 +32,22 @@ __all__ = [ DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset') + # When running unit tests, there could be multiple processes that # trying to create DATA_HOME directory simultaneously, so we cannot # use a if condition to check for the existence of the directory; # instead, we use the filesystem as the synchronization mechanism by # catching returned errors. -try: - os.makedirs(DATA_HOME) -except OSError as exc: - if exc.errno != errno.EEXIST: - raise - pass +def must_mkdirs(path): + try: + os.makedirs(DATA_HOME) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + pass + + +must_mkdirs(DATA_HOME) def md5file(fname): @@ -93,6 +98,19 @@ def fetch_all(): "fetch")() +def fetch_all_recordio(path): + for module_name in filter(lambda x: not x.startswith("__"), + dir(paddle.v2.dataset)): + if "convert" in dir( + importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \ + not module_name == "common": + ds_path = os.path.join(path, module_name) + must_mkdirs(ds_path) + getattr( + importlib.import_module("paddle.v2.dataset.%s" % module_name), + "convert")(ds_path) + + def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): """ you can call the function as: diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index f8aae52e7c29d86c7da9c1da0dd1d093634d4567..23f5a24a1cea7f665fb65e802e1a7811df78208d 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -233,5 +233,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, test(), 10, "conl105_train") - paddle.v2.dataset.common.convert(path, test(), 10, "conl105_test") + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test") diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index c0ec5992e0e6b0a2fd2359910d0f7a6c690c2ec3..93dd3e8f7d3a569eaf56335f0f92bed04c0ee26c 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -173,5 +173,5 @@ def convert(path): Converts dataset to recordio format """ w = word_dict() - paddle.v2.dataset.common.convert(path, lambda: train(w), 10, "imdb_train") - paddle.v2.dataset.common.convert(path, lambda: test(w), 10, "imdb_test") + paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train") + paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test") diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py index b18ee8e9ba91e0e8ccf061223b3c0d4636442956..617c722c4165cdfed9e650fc968d623ef6ed4391 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/v2/dataset/imikolov.py @@ -155,6 +155,7 @@ def convert(path): N = 5 word_dict = build_dict() paddle.v2.dataset.common.convert(path, - train(word_dict, N), 10, "imikolov_train") + train(word_dict, N), 1000, + "imikolov_train") paddle.v2.dataset.common.convert(path, - test(word_dict, N), 10, "imikolov_test") + test(word_dict, N), 1000, "imikolov_test") diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index ea5891f4f3f6ee1c5023cccee9732cbd9d78b881..9f675bed895223e054cd3bb6e504fe1607f19858 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -119,5 +119,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 10, "minist_train") - paddle.v2.dataset.common.convert(path, test(), 10, "minist_test") + paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test") diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index d9372d422a3293eddeb7c0d5b7c8980f55c44690..5b61a9420af1bb81e1d826f8a7b69f34c306d382 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -254,8 +254,8 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 10, "movielens_train") - paddle.v2.dataset.common.convert(path, test(), 10, "movielens_test") + paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test") if __name__ == '__main__': diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py index e33f120c8734621fd60497298d993e6e43bd06e0..b0b9757c1a75d215cf8945b5cedbb1239fd43af7 100644 --- a/python/paddle/v2/dataset/sentiment.py +++ b/python/paddle/v2/dataset/sentiment.py @@ -137,5 +137,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train, 10, "sentiment_train") - paddle.v2.dataset.common.convert(path, test, 10, "sentiment_test") + paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train") + paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test") diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index ec10ce646ebf3eca2c2a6423b69ee11b6a2b99cf..ce60aa21c2ad1fb8f089d19d548b59a8c806d1ee 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py @@ -119,5 +119,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 10, "uci_housing_train") - paddle.v2.dataset.common.convert(path, test(), 10, "uci_houseing_test") + paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test") diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index 2a631c365f27a6039021a56268a62017638c2739..95a35d97ce9d9503153974cc167ee60829244d5f 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -169,5 +169,6 @@ def convert(path): Converts dataset to recordio format """ dict_size = 30000 - paddle.v2.dataset.common.convert(path, train(dict_size), 10, "wmt14_train") - paddle.v2.dataset.common.convert(path, test(dict_size), 10, "wmt14_test") + paddle.v2.dataset.common.convert(path, + train(dict_size), 1000, "wmt14_train") + paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test") diff --git a/python/paddle/v2/framework/create_op_creation_methods.py b/python/paddle/v2/framework/create_op_creation_methods.py index b034efffb69030cb09e09ea545e9bff6f1744671..6fd33b366b6d376cc51ba5d663bb04d45ab8c933 100644 --- a/python/paddle/v2/framework/create_op_creation_methods.py +++ b/python/paddle/v2/framework/create_op_creation_methods.py @@ -1,7 +1,7 @@ import paddle.v2.framework.core as core import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2 import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2 -import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2 +import paddle.v2.framework.proto.attribute_pb2 as attribute_pb2 import cStringIO @@ -57,7 +57,7 @@ class OpDescCreationMethod(object): op_desc.attrs.extend([out_format]) if len(tmp_index) != 0: tmp_index_attr = op_desc.attrs.add() - tmp_index_attr.type = attr_type_pb2.INTS + tmp_index_attr.type = attribute_pb2.INTS tmp_index_attr.name = "temporary_index" tmp_index_attr.ints.extend(tmp_index) @@ -73,17 +73,17 @@ class OpDescCreationMethod(object): new_attr = op_desc.attrs.add() new_attr.name = attr.name new_attr.type = attr.type - if attr.type == attr_type_pb2.INT: + if attr.type == attribute_pb2.INT: new_attr.i = user_defined_attr - elif attr.type == attr_type_pb2.FLOAT: + elif attr.type == attribute_pb2.FLOAT: new_attr.f = user_defined_attr - elif attr.type == attr_type_pb2.STRING: + elif attr.type == attribute_pb2.STRING: new_attr.s = user_defined_attr - elif attr.type == attr_type_pb2.INTS: + elif attr.type == attribute_pb2.INTS: new_attr.ints.extend(user_defined_attr) - elif attr.type == attr_type_pb2.FLOATS: + elif attr.type == attribute_pb2.FLOATS: new_attr.floats.extend(user_defined_attr) - elif attr.type == attr_type_pb2.STRINGS: + elif attr.type == attribute_pb2.STRINGS: new_attr.strings.extend(user_defined_attr) else: raise NotImplementedError("Not support attribute type " + @@ -109,7 +109,7 @@ class OpDescCreationMethod(object): retv = [] if multiple: var_format = op_desc_pb2.AttrDesc() - var_format.type = attr_type_pb2.INTS + var_format.type = attribute_pb2.INTS var_format.name = "%s_format" % in_out var_format.ints.append(0) @@ -185,17 +185,17 @@ def get_docstring_from_op_proto(op_proto): for attr in op_proto.attrs: attr_type = None - if attr.type == attr_type_pb2.INT: + if attr.type == attribute_pb2.INT: attr_type = "int" - elif attr.type == attr_type_pb2.FLOAT: + elif attr.type == attribute_pb2.FLOAT: attr_type = "float" - elif attr.type == attr_type_pb2.STRING: + elif attr.type == attribute_pb2.STRING: attr_type = "basestr" - elif attr.type == attr_type_pb2.INTS: + elif attr.type == attribute_pb2.INTS: attr_type = "list of int" - elif attr.type == attr_type_pb2.FLOATS: + elif attr.type == attribute_pb2.FLOATS: attr_type = "list of float" - elif attr.type == attr_type_pb2.STRINGS: + elif attr.type == attribute_pb2.STRINGS: attr_type = "list of basestr" if attr_type is None: diff --git a/python/paddle/v2/framework/default_scope_funcs.py b/python/paddle/v2/framework/default_scope_funcs.py index 4e772326c94b7ee44906c71f13e9420e078a1917..1b5580c8b30f69016f187b1d8710a57b5f7cfa9f 100644 --- a/python/paddle/v2/framework/default_scope_funcs.py +++ b/python/paddle/v2/framework/default_scope_funcs.py @@ -5,7 +5,7 @@ Default scope function. thread-local stack of Scope. Top of that stack is current scope, the bottom of that stack is all scopes' parent. -Invoking `create_var/get_var` can `create/get` variable in current scope. +Invoking `new_var/find_var` can `new/find` variable in current scope. Invoking `enter_local_scope/leave_local_scope` can create or destroy local scope. @@ -19,8 +19,8 @@ import threading __tl_scope__ = threading.local() __all__ = [ - 'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'create_var', - 'get_var', 'scoped_function' + 'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'new_var', + 'find_var', 'scoped_function' ] @@ -33,7 +33,7 @@ def get_cur_scope(): if cur_scope_stack is None: __tl_scope__.cur_scope = list() if len(__tl_scope__.cur_scope) == 0: - __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope(None)) + __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope()) return __tl_scope__.cur_scope[-1] @@ -42,7 +42,7 @@ def enter_local_scope(): Enter a new local scope """ cur_scope = get_cur_scope() - new_scope = paddle.v2.framework.core.Scope(cur_scope) + new_scope = cur_scope.new_scope() __tl_scope__.cur_scope.append(new_scope) @@ -51,20 +51,21 @@ def leave_local_scope(): Leave local scope """ __tl_scope__.cur_scope.pop() + get_cur_scope().drop_kids() -def create_var(name): +def new_var(name): """ create variable in current scope. """ - return get_cur_scope().create_var(name) + return get_cur_scope().new_var(name) -def get_var(name): +def find_var(name): """ get variable in current scope. """ - return get_cur_scope().get_var(name) + return get_cur_scope().find_var(name) def scoped_function(func): diff --git a/python/paddle/v2/framework/network.py b/python/paddle/v2/framework/network.py index c85e87413ef45f40755709e134a277b8d8d1e233..cfeb0e3dec0fd2c6ad4d2d2501f97932495fdd41 100644 --- a/python/paddle/v2/framework/network.py +++ b/python/paddle/v2/framework/network.py @@ -1,6 +1,6 @@ import paddle.v2.framework.core as core from paddle.v2.framework.create_op_creation_methods import op_creations -from default_scope_funcs import create_var, get_var, get_cur_scope +from default_scope_funcs import new_var, find_var, get_cur_scope __all__ = ['Network'] # Only expose Network @@ -29,12 +29,15 @@ class NetworkFunctor(object): if ipt in kwargs: var = kwargs[ipt] if isinstance(var, basestring): - var = create_var(var) + tmp = new_var(var) + self.net.var_names[tmp] = var + var = tmp + if not isinstance(var, core.Variable): raise TypeError( "Input of op creation must be string or variable") - kwargs[ipt] = get_cur_scope().get_var_name(var) + kwargs[ipt] = self.net.var_names[var] notemp_outputs = self.func.all_not_temp_output_args @@ -49,17 +52,20 @@ class NetworkFunctor(object): if opt in kwargs: var = kwargs[opt] if isinstance(var, basestring): - var = create_var(var) + tmp = new_var(var) + self.net.var_names[tmp] = var + var = tmp + if not isinstance(var, core.Variable): raise TypeError( "Output of op creation must be string or variable") - kwargs[opt] = get_cur_scope().get_var_name(var) + kwargs[opt] = self.net.var_names[var] op = self.func(**kwargs) self.net.net.add_op(op) - lst = [get_var(kwargs[opt]) for opt in notemp_outputs] + lst = [find_var(kwargs[opt]) for opt in notemp_outputs] if len(lst) == 1: return lst[0] elif len(lst) == 0: @@ -89,6 +95,7 @@ class Network(object): self.net = core.Net.create() funcs = (func_name for func_name in dir(op_creations) if not func_name.startswith("__")) + self.var_names = dict() # TODO(yuyang18): This code can work, but do not generate a good # docstring, try to give a better way generate function in runtime diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index deaa350133a88d8c9902b3c120f6583580ccd759..ee6726c31f278331df362616dbc1b6e6d9a8dbee 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -8,10 +8,11 @@ add_python_test(test_framework test_fc_op.py test_add_two_op.py test_sgd_op.py - test_cross_entropy_op.py test_mul_op.py + test_mean_op.py test_sigmoid_op.py test_softmax_op.py test_rowwise_add_op.py test_random_op.py - test_network.py) + test_network.py + gradient_checker.py) diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..4022de1c40e41aa77a7f31d82b55b63585cbd5f5 --- /dev/null +++ b/python/paddle/v2/framework/tests/gradient_checker.py @@ -0,0 +1,90 @@ +import paddle.v2.framework.core as core +from paddle.v2.framework.create_op_creation_methods import op_creations +import numpy +import unittest + +__all__ = ['get_numeric_gradient'] + + +def get_numeric_gradient(op, + input_values, + output_name, + input_to_check, + delta=1e-2, + local_scope=None): + """ + Get Numeric Gradient for an operator's input. + + :param op: C++ operator instance, could be an network + :param input_values: The input variables. Should be an dictionary, key is + variable name. Value is numpy array. + :param output_name: The final output variable name. + :param input_to_check: The input variable need to get gradient. + :param delta: The perturbation value for numeric gradient method. The + smaller delta is, the more accurate result will get. But if that delta is + too small, it could occur numerical stability problem. + :param local_scope: The local scope used for get_numeric_gradient. + :return: The gradient array in numpy format. + """ + if local_scope is None: + local_scope = core.Scope() + + # Create all input variable in local_scope + for var_name in input_values: + var = local_scope.new_var(var_name) + tensor = var.get_tensor() + tensor.set_dims(input_values[var_name].shape) + tensor.alloc_float(core.CPUPlace()) + tensor.set(input_values[var_name], core.CPUPlace()) + + # Create all output variable in local_scope + for output in op.outputs(): + if local_scope.find_var(output) is None: + local_scope.new_var(output).get_tensor() + + op.infer_shape(local_scope) + + # allocate output memory + for output in op.outputs(): + local_scope.find_var(output).get_tensor().alloc_float(core.CPUPlace()) + + # TODO(yuyang18): Only CPU is support now. + cpu_ctx = core.DeviceContext.create(core.CPUPlace()) + + def get_output(): + op.run(local_scope, cpu_ctx) + return numpy.array(local_scope.find_var(output_name).get_tensor()).sum() + + def product(dim): + return reduce(lambda a, b: a * b, dim, 1) + + tensor_to_check = local_scope.find_var(input_to_check).get_tensor() + tensor_size = product(tensor_to_check.get_dims()) + gradient_flat = numpy.zeros(shape=(tensor_size, ), dtype='float32') + for i in xrange(tensor_size): + origin = tensor_to_check.get_float_element(i) + x_pos = origin + delta + tensor_to_check.set_float_element(i, x_pos) + y_pos = get_output() + + x_neg = origin - delta + tensor_to_check.set_float_element(i, x_neg) + y_neg = get_output() + + tensor_to_check.set_float_element(i, origin) # restore old value + gradient_flat[i] = (y_pos - y_neg) / delta / 2 + return gradient_flat.reshape(tensor_to_check.get_dims()) + + +if __name__ == '__main__': + + class GetNumericGradientTest(unittest.TestCase): + def test_add_op(self): + add_op = op_creations.add_two(X="X", Y="Y", Out="Z") + x = numpy.random.random((10, 1)).astype("float32") + y = numpy.random.random((10, 1)).astype("float32") + + arr = get_numeric_gradient(add_op, {'X': x, "Y": y}, 'Z', 'X') + self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-2) + + unittest.main() diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py index 7b62313f8aca5e9f515d1a9e6df3bb6f51b974fb..9ee66c2c5103811519c3a2c28653536f97009161 100644 --- a/python/paddle/v2/framework/tests/op_test_util.py +++ b/python/paddle/v2/framework/tests/op_test_util.py @@ -24,42 +24,44 @@ class OpTestMeta(type): func = getattr(creation.op_creations, self.type, None) self.assertIsNotNone(func) - scope = core.Scope(None) + scope = core.Scope() kwargs = dict() + places = [] + places.append(core.CPUPlace()) + if core.is_compile_gpu(): + places.append(core.GPUPlace(0)) - for in_name in func.all_input_args: - if hasattr(self, in_name): - kwargs[in_name] = in_name - var = scope.create_var(in_name).get_tensor() - arr = getattr(self, in_name) - var.set_dims(arr.shape) - var.set(arr) - else: - kwargs[in_name] = "@EMPTY@" + for place in places: + for in_name in func.all_input_args: + if hasattr(self, in_name): + kwargs[in_name] = in_name + var = scope.new_var(in_name).get_tensor() + arr = getattr(self, in_name) + var.set_dims(arr.shape) + var.set(arr, place) + else: + kwargs[in_name] = "@EMPTY@" - for out_name in func.all_output_args: - if hasattr(self, out_name): - kwargs[out_name] = out_name - scope.create_var(out_name).get_tensor() + for out_name in func.all_output_args: + if hasattr(self, out_name): + kwargs[out_name] = out_name + scope.new_var(out_name).get_tensor() - for attr_name in func.all_attr_args: - if hasattr(self, attr_name): - kwargs[attr_name] = getattr(self, attr_name) + for attr_name in func.all_attr_args: + if hasattr(self, attr_name): + kwargs[attr_name] = getattr(self, attr_name) - op = func(**kwargs) + op = func(**kwargs) - op.infer_shape(scope) + op.infer_shape(scope) - ctx = core.DeviceContext.cpu_context() - op.run(scope, ctx) + ctx = core.DeviceContext.create(place) + op.run(scope, ctx) - for out_name in func.all_output_args: - actual = numpy.array(scope.get_var(out_name).get_tensor()) - expect = getattr(self, out_name) - # TODO(qijun) The default decimal is 7, but numpy.dot and eigen.mul - # has some diff, and could not pass unittest. So I set decimal 3 here. - # And I will check this in future. - numpy.testing.assert_almost_equal(actual, expect, decimal=3) + for out_name in func.all_output_args: + actual = numpy.array(scope.find_var(out_name).get_tensor()) + expect = getattr(self, out_name) + numpy.isclose(actual, expect) obj.test_all = test_all return obj diff --git a/python/paddle/v2/framework/tests/test_add_two_op.py b/python/paddle/v2/framework/tests/test_add_two_op.py index a06d7a78ecf838a49e5f2808d3686c6b92faa8ce..6e6643201bf361fce1bad7de10b2562f0525e00a 100644 --- a/python/paddle/v2/framework/tests/test_add_two_op.py +++ b/python/paddle/v2/framework/tests/test_add_two_op.py @@ -1,6 +1,10 @@ import unittest -from op_test_util import OpTestMeta + import numpy +import paddle.v2.framework.core as core +import paddle.v2.framework.create_op_creation_methods as creation + +from op_test_util import OpTestMeta class TestAddOp(unittest.TestCase): @@ -8,10 +12,19 @@ class TestAddOp(unittest.TestCase): def setUp(self): self.type = "add_two" - self.X = numpy.random.random((342, 345)).astype("float32") - self.Y = numpy.random.random((342, 345)).astype("float32") + self.X = numpy.random.random((102, 105)).astype("float32") + self.Y = numpy.random.random((102, 105)).astype("float32") self.Out = self.X + self.Y +class TestAddGradOp(unittest.TestCase): + def test_add_grad(self): + op = creation.op_creations.add_two(X="X", Y="Y", Out="Out") + backward_op = core.Operator.backward(op, set()) + self.assertEqual(backward_op.type(), "add_two_grad") + expected = '''Op(add_two_grad), inputs:(X, Y, Out, Out@GRAD), outputs:(X@GRAD, Y@GRAD).''' + self.assertEqual(expected, str(backward_op)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 609c56535ef0365dda728cba334d8b4d96312192..6d022f6bc0be60dbf2f796780a969bff0e8bfded 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -18,5 +18,7 @@ class TestSGD(unittest.TestCase): self.Y = numpy.array(Y).astype("float32") +# TODO(superjom) add gradient check + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/v2/framework/tests/test_default_scope_funcs.py b/python/paddle/v2/framework/tests/test_default_scope_funcs.py index 81033deb1546c81e2566ec5474f45dc56781644a..495863c4562b5a2d6755fb02e21a6b0c845fd7b6 100644 --- a/python/paddle/v2/framework/tests/test_default_scope_funcs.py +++ b/python/paddle/v2/framework/tests/test_default_scope_funcs.py @@ -7,19 +7,19 @@ class TestDefaultScopeFuncs(unittest.TestCase): self.assertIsNotNone(get_cur_scope()) def test_none_variable(self): - self.assertIsNone(get_var("test")) + self.assertIsNone(find_var("test")) def test_create_var_get_var(self): - var_a = create_var("var_a") + var_a = new_var("var_a") self.assertIsNotNone(var_a) - self.assertIsNotNone(get_cur_scope().get_var('var_a')) + self.assertIsNotNone(get_cur_scope().find_var('var_a')) enter_local_scope() - self.assertIsNotNone(get_cur_scope().get_var('var_a')) + self.assertIsNotNone(get_cur_scope().find_var('var_a')) leave_local_scope() def test_var_get_int(self): def __new_scope__(): - i = create_var("var_i") + i = new_var("var_i") self.assertFalse(i.is_int()) i.set_int(10) self.assertTrue(i.is_int()) diff --git a/python/paddle/v2/framework/tests/test_fc_op.py b/python/paddle/v2/framework/tests/test_fc_op.py index 59e7e61249e2a7d49a17e5d87209f03b8f35f730..00dc4399aaf59e6382692c3a4356f89a7e79a0c5 100644 --- a/python/paddle/v2/framework/tests/test_fc_op.py +++ b/python/paddle/v2/framework/tests/test_fc_op.py @@ -6,18 +6,20 @@ import paddle.v2.framework.create_op_creation_methods as creation class TestFc(unittest.TestCase): def test_fc(self): - scope = core.Scope(None) - x = scope.create_var("X") + scope = core.Scope() + place = core.CPUPlace() + x = scope.new_var("X") + x_tensor = x.get_tensor() x_tensor.set_dims([1000, 784]) - x_tensor.alloc_float() + x_tensor.alloc_float(place) - w = scope.create_var("W") + w = scope.new_var("W") w_tensor = w.get_tensor() w_tensor.set_dims([784, 100]) - w_tensor.alloc_float() + w_tensor.alloc_float(place) - w_tensor.set(numpy.random.random((784, 100)).astype("float32")) + w_tensor.set(numpy.random.random((784, 100)).astype("float32"), place) # Set a real numpy array here. # x_tensor.set(numpy.array([])) @@ -25,14 +27,14 @@ class TestFc(unittest.TestCase): op = creation.op_creations.fc(X="X", Y="Y", W="W") for out in op.outputs(): - if scope.get_var(out) is None: - scope.create_var(out).get_tensor() + if scope.find_var(out) is None: + scope.new_var(out).get_tensor() - tensor = scope.get_var("Y").get_tensor() + tensor = scope.find_var("Y").get_tensor() op.infer_shape(scope) self.assertEqual([1000, 100], tensor.shape()) - ctx = core.DeviceContext.cpu_context() + ctx = core.DeviceContext.create(place) op.run(scope, ctx) diff --git a/python/paddle/v2/framework/tests/test_mean_op.py b/python/paddle/v2/framework/tests/test_mean_op.py new file mode 100644 index 0000000000000000000000000000000000000000..78fff1eeff998109a51ea662f963a102eff49d3a --- /dev/null +++ b/python/paddle/v2/framework/tests/test_mean_op.py @@ -0,0 +1,16 @@ +import unittest +from op_test_util import OpTestMeta +import numpy as np + + +class TestMeanOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "mean" + self.X = np.random.random((32, 784)).astype("float32") + self.Out = np.mean(self.X) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/framework/tests/test_mul_op.py index 0a87e66cd03af1bf84be8ffe111e4a8c3a24d6dc..e1ac66d3a4d23d617f7c5a4d97d070b2660954c8 100644 --- a/python/paddle/v2/framework/tests/test_mul_op.py +++ b/python/paddle/v2/framework/tests/test_mul_op.py @@ -8,8 +8,8 @@ class TestMulOp(unittest.TestCase): def setUp(self): self.type = "mul" - self.X = np.random.random((32, 784)).astype("float32") - self.Y = np.random.random((784, 100)).astype("float32") + self.X = np.random.random((32, 84)).astype("float32") + self.Y = np.random.random((84, 100)).astype("float32") self.Out = np.dot(self.X, self.Y) diff --git a/python/paddle/v2/framework/tests/test_op_creation_methods.py b/python/paddle/v2/framework/tests/test_op_creation_methods.py index 41db7c0d535aa920b34d6cc346090a8c15bfb110..1d2ce6d0717bfb45355fe0cabc516a598492d518 100644 --- a/python/paddle/v2/framework/tests/test_op_creation_methods.py +++ b/python/paddle/v2/framework/tests/test_op_creation_methods.py @@ -3,7 +3,7 @@ import paddle.v2.framework.create_op_creation_methods as creation import paddle.v2.framework.core as core import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2 import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2 -import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2 +import paddle.v2.framework.proto.attribute_pb2 as attribute_pb2 class TestGetAllProtos(unittest.TestCase): @@ -76,7 +76,7 @@ class TestOpDescCreationMethod(unittest.TestCase): expected1.type = 'fc' attr = expected1.attrs.add() attr.name = 'input_format' - attr.type = attr_type_pb2.INTS + attr.type = attribute_pb2.INTS attr.ints.extend([0, 1, 2, 3]) self.assertEqual(expected1, generated1) @@ -88,7 +88,7 @@ class TestOpDescCreationMethod(unittest.TestCase): expected2.type = 'fc' attr = expected2.attrs.add() attr.name = 'input_format' - attr.type = attr_type_pb2.INTS + attr.type = attribute_pb2.INTS attr.ints.extend([0, 3, 6, 7]) self.assertEqual(expected2, generated2) @@ -105,12 +105,12 @@ class TestOpDescCreationMethod(unittest.TestCase): attr.comment = "" attr.type = type - __add_attr__("int_attr", attr_type_pb2.INT) - __add_attr__("float_attr", attr_type_pb2.FLOAT) - __add_attr__("string_attr", attr_type_pb2.STRING) - __add_attr__("ints_attr", attr_type_pb2.INTS) - __add_attr__("floats_attr", attr_type_pb2.FLOATS) - __add_attr__("strings_attr", attr_type_pb2.STRINGS) + __add_attr__("int_attr", attribute_pb2.INT) + __add_attr__("float_attr", attribute_pb2.FLOAT) + __add_attr__("string_attr", attribute_pb2.STRING) + __add_attr__("ints_attr", attribute_pb2.INTS) + __add_attr__("floats_attr", attribute_pb2.FLOATS) + __add_attr__("strings_attr", attribute_pb2.STRINGS) op.comment = "" self.assertTrue(op.IsInitialized()) @@ -131,32 +131,32 @@ class TestOpDescCreationMethod(unittest.TestCase): expected.inputs.extend(['a']) attr = expected.attrs.add() attr.name = "int_attr" - attr.type = attr_type_pb2.INT + attr.type = attribute_pb2.INT attr.i = 10 attr = expected.attrs.add() attr.name = "float_attr" - attr.type = attr_type_pb2.FLOAT + attr.type = attribute_pb2.FLOAT attr.f = 3.2 attr = expected.attrs.add() attr.name = "string_attr" - attr.type = attr_type_pb2.STRING + attr.type = attribute_pb2.STRING attr.s = "test_str" attr = expected.attrs.add() attr.name = "ints_attr" - attr.type = attr_type_pb2.INTS + attr.type = attribute_pb2.INTS attr.ints.extend([0, 1, 2, 3, 4]) attr = expected.attrs.add() attr.name = 'floats_attr' - attr.type = attr_type_pb2.FLOATS + attr.type = attribute_pb2.FLOATS attr.floats.extend([0.2, 3.2, 4.5]) attr = expected.attrs.add() attr.name = 'strings_attr' - attr.type = attr_type_pb2.STRINGS + attr.type = attribute_pb2.STRINGS attr.strings.extend(['a', 'b', 'c']) self.assertEqual(expected, generated) @@ -185,7 +185,7 @@ class TestOpDescCreationMethod(unittest.TestCase): desc.type = "test" attr = desc.attrs.add() attr.name = "temporary_index" - attr.type = attr_type_pb2.INTS + attr.type = attribute_pb2.INTS attr.ints.append(2) self.assertEqual(generated, desc) @@ -219,7 +219,7 @@ This op is used for unit test, not a real op. test_str = op.attrs.add() test_str.name = "str_attr" - test_str.type = attr_type_pb2.STRING + test_str.type = attribute_pb2.STRING test_str.comment = "A string attribute for test op" actual = creation.get_docstring_from_op_proto(op) diff --git a/python/paddle/v2/framework/tests/test_protobuf.py b/python/paddle/v2/framework/tests/test_protobuf.py index b8702477e64203e735bff05b115eafbb2a52172d..69e98e2f250a9df23b25e7e2043af29f87c996a0 100644 --- a/python/paddle/v2/framework/tests/test_protobuf.py +++ b/python/paddle/v2/framework/tests/test_protobuf.py @@ -1,12 +1,10 @@ -import paddle.v2.framework.proto.op_proto_pb2 -import paddle.v2.framework.proto.attr_type_pb2 +import paddle.v2.framework.proto.op_proto_pb2 as op_proto_lib +import paddle.v2.framework.proto.attribute_pb2 as attr_type_lib import unittest class TestFrameworkProto(unittest.TestCase): def test_all(self): - op_proto_lib = paddle.v2.framework.proto.op_proto_pb2 - attr_type_lib = paddle.v2.framework.proto.attr_type_pb2 op_proto = op_proto_lib.OpProto() ipt0 = op_proto.inputs.add() ipt0.name = "a" diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py index ef1514983c03f822f84b85437d1cfe653b6a1a2e..04abc14ee198fe4e2307e009c696a2b40ec271b6 100644 --- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py +++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py @@ -8,8 +8,8 @@ class TestRowwiseAddOp(unittest.TestCase): def setUp(self): self.type = "rowwise_add" - self.X = np.random.random((32, 784)).astype("float32") - self.b = np.random.random(784).astype("float32") + self.X = np.random.random((32, 84)).astype("float32") + self.b = np.random.random(84).astype("float32") self.Out = np.add(self.X, self.b) diff --git a/python/paddle/v2/framework/tests/test_scope.py b/python/paddle/v2/framework/tests/test_scope.py index f0ee45cfc75e486c693a00d92a97ac0970195581..1ce9454067f91f39f01d9eb4c912857464a3c1cb 100644 --- a/python/paddle/v2/framework/tests/test_scope.py +++ b/python/paddle/v2/framework/tests/test_scope.py @@ -5,29 +5,29 @@ import unittest class TestScope(unittest.TestCase): def test_create_destroy(self): paddle_c = paddle.v2.framework.core - scope = paddle_c.Scope(None) + scope = paddle_c.Scope() self.assertIsNotNone(scope) - scope_with_parent = paddle_c.Scope(scope) + scope_with_parent = scope.new_scope() self.assertIsNotNone(scope_with_parent) def test_none_variable(self): paddle_c = paddle.v2.framework.core - scope = paddle_c.Scope(None) - self.assertIsNone(scope.get_var("test")) + scope = paddle_c.Scope() + self.assertIsNone(scope.find_var("test")) def test_create_var_get_var(self): paddle_c = paddle.v2.framework.core - scope = paddle_c.Scope(None) - var_a = scope.create_var("var_a") + scope = paddle_c.Scope() + var_a = scope.new_var("var_a") self.assertIsNotNone(var_a) - self.assertIsNotNone(scope.get_var('var_a')) - scope2 = paddle_c.Scope(scope) - self.assertIsNotNone(scope2.get_var('var_a')) + self.assertIsNotNone(scope.find_var('var_a')) + scope2 = scope.new_scope() + self.assertIsNotNone(scope2.find_var('var_a')) def test_var_get_int(self): paddle_c = paddle.v2.framework.core - scope = paddle_c.Scope(None) - var = scope.create_var("test_int") + scope = paddle_c.Scope() + var = scope.new_var("test_int") var.set_int(10) self.assertTrue(var.is_int()) self.assertEqual(10, var.get_int()) diff --git a/python/paddle/v2/framework/tests/test_sgd_op.py b/python/paddle/v2/framework/tests/test_sgd_op.py index 405d73b224fa153e50b4ec408a921f2bdaab46aa..ca03cc11abe2ceb31b33a87797aa752943dd2a7d 100644 --- a/python/paddle/v2/framework/tests/test_sgd_op.py +++ b/python/paddle/v2/framework/tests/test_sgd_op.py @@ -8,8 +8,8 @@ class TestSGD(unittest.TestCase): def setUp(self): self.type = "sgd" - self.param = numpy.random.random((342, 345)).astype("float32") - self.grad = numpy.random.random((342, 345)).astype("float32") + self.param = numpy.random.random((102, 105)).astype("float32") + self.grad = numpy.random.random((102, 105)).astype("float32") self.learning_rate = 0.1 self.param_out = self.param - self.learning_rate * self.grad diff --git a/python/paddle/v2/framework/tests/test_softmax_op.py b/python/paddle/v2/framework/tests/test_softmax_op.py index 191b698c1cdec9b86b4ded6b1f743586867ca62f..c80888128781d98e4ed30d845a30b39121f66459 100644 --- a/python/paddle/v2/framework/tests/test_softmax_op.py +++ b/python/paddle/v2/framework/tests/test_softmax_op.py @@ -1,6 +1,10 @@ import unittest -from op_test_util import OpTestMeta + import numpy as np +import paddle.v2.framework.core as core +import paddle.v2.framework.create_op_creation_methods as creation + +from op_test_util import OpTestMeta def stable_softmax(x): @@ -19,5 +23,63 @@ class TestSoftmaxOp(unittest.TestCase): self.Y = np.apply_along_axis(stable_softmax, 1, self.X) +class TestSoftmaxGradOp(unittest.TestCase): + def test_softmax_grad(self): + op = creation.op_creations.softmax(X="X", Y="Y") + backward_op = core.Operator.backward(op, set()) + self.assertEqual(backward_op.type(), "softmax_grad") + expected = '''Op(softmax_grad), inputs:(X, Y, Y@GRAD), outputs:(X@GRAD).''' + self.assertEqual(expected, str(backward_op)) + + batch_size = 3 + class_num = 5 + # Initialize X and add 1e-2 for numerical stability + Y = np.random.rand(batch_size, class_num).astype(np.float32) + Y = Y + 1e-2 + dY = np.random.rand(batch_size, class_num).astype(np.float32) + + # Reference implementation of cross entropy with soft labels + def label_softmax_grad(Y, dY): + dX = Y * 0.0 + for i in range(batch_size): + d = np.dot(Y[i, :], dY[i, :]) + dX[i, :] = Y[i, :] * (dY[i, :] - d) + return dX + + expected = label_softmax_grad(Y, dY) + + scope = core.Scope() + places = [] + places.append(core.CPUPlace()) + if core.is_compile_gpu(): + places.append(core.GPUPlace(0)) + + for place in places: + y = scope.new_var("Y") + y_tensor = y.get_tensor() + y_tensor.set_dims([batch_size, class_num]) + y_tensor.alloc_float(place) + y_tensor.set(Y, place) + + dy = scope.new_var("Y@GRAD") + dy_tensor = dy.get_tensor() + dy_tensor.set_dims([batch_size, class_num]) + dy_tensor.alloc_float(place) + dy_tensor.set(dY, place) + + x = scope.new_var("X") + dx = scope.new_var("X@GRAD") + + tensor = scope.find_var("X@GRAD").get_tensor() + backward_op.infer_shape(scope) + self.assertEqual([batch_size, class_num], tensor.shape()) + + ctx = core.DeviceContext.create(place) + backward_op.run(scope, ctx) + actual = np.array(tensor) + + np.testing.assert_almost_equal(actual, expected, decimal=3) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_tensor.py b/python/paddle/v2/framework/tests/test_tensor.py index b72aff3b9cd16595c7e81856642196b2bb61a790..1af39818a305215b45219b8c5f0a10630fd64279 100644 --- a/python/paddle/v2/framework/tests/test_tensor.py +++ b/python/paddle/v2/framework/tests/test_tensor.py @@ -5,36 +5,39 @@ import numpy class TestScope(unittest.TestCase): def test_int_tensor(self): - scope = core.Scope(None) - var = scope.create_var("test_tensor") + scope = core.Scope() + var = scope.new_var("test_tensor") + place = core.CPUPlace() + tensor = var.get_tensor() tensor.set_dims([1000, 784]) - tensor.alloc_int() - + tensor.alloc_int(place) tensor_array = numpy.array(tensor) self.assertEqual((1000, 784), tensor_array.shape) tensor_array[3, 9] = 1 tensor_array[19, 11] = 2 - tensor.set(tensor_array) + tensor.set(tensor_array, place) tensor_array_2 = numpy.array(tensor) self.assertEqual(1.0, tensor_array_2[3, 9]) self.assertEqual(2.0, tensor_array_2[19, 11]) def test_float_tensor(self): - scope = core.Scope(None) - var = scope.create_var("test_tensor") + scope = core.Scope() + var = scope.new_var("test_tensor") + place = core.CPUPlace() + tensor = var.get_tensor() tensor.set_dims([1000, 784]) - tensor.alloc_float() + tensor.alloc_float(place) tensor_array = numpy.array(tensor) self.assertEqual((1000, 784), tensor_array.shape) tensor_array[3, 9] = 1.0 tensor_array[19, 11] = 2.0 - tensor.set(tensor_array) + tensor.set(tensor_array, place) tensor_array_2 = numpy.array(tensor) self.assertAlmostEqual(1.0, tensor_array_2[3, 9]) diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py index b658a81630733fea3976b812afe819d76de4cb25..fc718f031e2267e737adbc340226e145bf614bf2 100644 --- a/python/paddle/v2/master/client.py +++ b/python/paddle/v2/master/client.py @@ -76,3 +76,6 @@ class client(object): # Memory created from C should be freed. get_c_lib().mem_free(ret.contents) return record, 0 + + def paddle_start_get_records(self, pass_id): + get_c_lib().paddle_start_get_records(self.c, pass_id) diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py index 55a0fcdf56af7a8c9bee3255ea6f1d1ae1b34893..d0f18e4b6611fa56654e7f2a0144758339cb9e19 100644 --- a/python/paddle/v2/reader/creator.py +++ b/python/paddle/v2/reader/creator.py @@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could be used in user program. """ -__all__ = ['np_array', 'text_file', "recordio"] +__all__ = ['np_array', 'text_file', "cloud_reader"] def np_array(x): @@ -81,35 +81,41 @@ def recordio_local(paths, buf_size=100): return dec.buffered(reader, buf_size) -def recordio(paths, buf_size=100): +pass_num = 0 + + +def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64): """ - Creates a data reader that outputs record one one by one - from given local or cloud recordio path. + Create a data reader that yield a record one by one from + the paths: :path: path of recordio files. + :etcd_endpoints: the endpoints for etcd cluster :returns: data reader of recordio files. + + .. code-block:: python + from paddle.v2.reader.creator import cloud_reader + etcd_endpoints = "http://127.0.0.1:2379" + trainer.train.( + reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints), + ) """ import os - import paddle.v2.master.client as cloud - - if "KUBERNETES_SERVICE_HOST" not in os.environ.keys(): - return recordio_local(paths) - - host_name = "MASTER_SERVICE_HOST" - if host_name not in os.environ.keys(): - raise Exception('not find ' + host_name + ' in environment variable.') - - addr = os.environ(host) + import cPickle as pickle + import paddle.v2.master as master + c = master.client(etcd_endpoints, timeout_sec, buf_size) + c.set_dataset(paths) def reader(): - c = cloud(addr, buf_size) - c.set_dataset(paths) + global pass_num + c.paddle_start_get_records(pass_num) + pass_num += 1 while True: - r, err = client.next_record() - if err < 0: + r, e = c.next_record() + if not r: + if e != -2: + print "get record error: ", e break - yield r - - c.release() + yield pickle.loads(r) return reader diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py index b42d273ecfe6c4bc5706ec52617960b83496d70d..359f3eeefbe8efeb343cc875c707c9251a7087fb 100644 --- a/python/paddle/v2/reader/tests/creator_test.py +++ b/python/paddle/v2/reader/tests/creator_test.py @@ -34,14 +34,5 @@ class TestTextFile(unittest.TestCase): self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1)) -class TestRecordIO(unittest.TestCase): - def test_recordio(self): - path = os.path.join( - os.path.dirname(__file__), "test_recordio_creator.dat") - reader = paddle.v2.reader.creator.recordio([path]) - for idx, r in enumerate(reader()): - self.assertSequenceEqual(r, str(idx)) - - if __name__ == '__main__': unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 65a26940d4d703ea4fbb5022523a90716982ec10..7808238aa6ba5ca5c13292638f1c513f87cc2af2 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -14,7 +14,7 @@ packages=['paddle', 'paddle.v2.framework.proto'] setup_requires=["requests", - "numpy", + "numpy>=1.12", "protobuf==3.1", "recordio", "matplotlib",