diff --git a/CMakeLists.txt b/CMakeLists.txt index fd3582a1bca199d62d19550ffdd1efe9db520fa7..9e30dff70fed51b604059610b22057349f22db58 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,8 +36,7 @@ include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) -option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND}) -option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND}) +option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) @@ -82,10 +81,8 @@ if(ANDROID OR IOS) "Disable PYTHON when cross-compiling for Android and iOS" FORCE) set(WITH_RDMA OFF CACHE STRING "Disable RDMA when cross-compiling for Android and iOS" FORCE) - set(WITH_MKLDNN OFF CACHE STRING - "Disable MKLDNN when cross-compiling for Android and iOS" FORCE) - set(WITH_MKLML OFF CACHE STRING - "Disable MKLML package when cross-compiling for Android and iOS" FORCE) + set(WITH_MKL OFF CACHE STRING + "Disable MKL when cross-compiling for Android and iOS" FORCE) # Compile PaddlePaddle mobile inference library if (NOT WITH_C_API) @@ -111,6 +108,17 @@ else() set(THIRD_PARTY_BUILD_TYPE Release) endif() +if(WITH_MKL) + set(WITH_MKLML ON) + set(WITH_MKLDNN ${AVX2_FOUND}) + if(NOT WITH_MKLDNN) + message(WARNING "Do not have AVX2 intrinsics and disabled MKL-DNN") + endif() +else() + set(WITH_MKLML OFF) + set(WITH_MKLDNN OFF) +endif() + ######################################################################################## include(external/mklml) # download mklml package @@ -164,8 +172,12 @@ if(WITH_GPU) endif(NOT WITH_DSO) endif(WITH_GPU) +if(WITH_MKLML) + list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB}) +endif() + if(WITH_MKLDNN) - list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB}) + list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB}) endif() if(USE_NNPACK) diff --git a/benchmark/paddle/image/run_mkldnn.sh b/benchmark/paddle/image/run_mkldnn.sh index a4527e04968cf8c8c3c31d16f50bc3e28381f6d8..3cc779b48d082985f75ab1c053fbe262bc6d58aa 100755 --- a/benchmark/paddle/image/run_mkldnn.sh +++ b/benchmark/paddle/image/run_mkldnn.sh @@ -1,9 +1,7 @@ set -e function train() { - unset OMP_NUM_THREADS MKL_NUM_THREADS - export OMP_DYNAMIC="FALSE" - export KMP_AFFINITY="granularity=fine,compact,0,0" + unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY topology=$1 layer_num=$2 bs=$3 @@ -14,8 +12,6 @@ function train() { elif [ $4 == "False" ]; then thread=`nproc` # each trainer_count use only 1 core to avoid conflict - export OMP_NUM_THREADS=1 - export MKL_NUM_THREADS=1 log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log" else echo "Wrong input $3, use True or False." diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 24ddb24399dabeec9b8e5faf36be3eb21f420111..e550ec285668ea25757eeee9e7c5dc48fc9d339d 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -76,27 +76,14 @@ else() include_directories(${CUDA_TOOLKIT_INCLUDE}) endif(NOT WITH_GPU) -if(WITH_MKLDNN) - add_definitions(-DPADDLE_USE_MKLDNN) - if (WITH_MKLML AND MKLDNN_IOMP_DIR) - message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}") - set(OPENMP_FLAGS "-fopenmp") - set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) - set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") - else() - find_package(OpenMP) - if(OPENMP_FOUND) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - else() - message(WARNING "Can not find OpenMP." - "Some performance features in MKLDNN may not be available") - endif() - endif() - -endif(WITH_MKLDNN) +if (WITH_MKLML AND MKLML_IOMP_LIB) + message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") + set(OPENMP_FLAGS "-fopenmp") + set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") +endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake index 310450f7d009dc0cdae9c0079a96445af8ec8f95..d3f5bf6852b3b295f3b5806b0577a880b0ce6ba6 100644 --- a/cmake/cross_compiling/ios.cmake +++ b/cmake/cross_compiling/ios.cmake @@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform") # Set the architecture for iOS if(NOT DEFINED IOS_ARCH) if(IOS_PLATFORM STREQUAL "OS") - # FIXME(liuyiqun): support "armv7;armv7s;arm64" future - set(IOS_ARCH "arm64") + set(IOS_ARCH "armv7;armv7s;arm64") elseif(IOS_PLATFORM STREQUAL "SIMULATOR") - # FIXME(liuyiqun): support "i386;x86_64" future - set(IOS_ARCH "x86_64") + set(IOS_ARCH "i386;x86_64") endif() endif() set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS") @@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_ # Hidden visibilty is required for cxx on iOS set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags") -set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") +set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first") diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 5a06825beb73e85d8a55b7b578b187bee2c4340c..fc52d339d7a336b44c97f2e0a9fc8d6604854365 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -40,10 +40,9 @@ INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) IF(${CBLAS_PROVIDER} STREQUAL "MKLML") SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) - SET(MKLDNN_MKLROOT ${MKLML_ROOT}) - SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB}) - SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR}) - MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}") + MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}") +ELSE() + MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN") ENDIF() SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow") @@ -57,15 +56,16 @@ ExternalProject_Add( PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} - CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT} + CMAKE_ARGS -DMKLROOT=${MKLML_ROOT} CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} - -DMKLROOT:PATH=${MKLDNN_MKLROOT} + -DMKLROOT:PATH=${MKLML_ROOT} ) ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) -MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}") +MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") +add_definitions(-DPADDLE_USE_MKLDNN) LIST(APPEND external_project_dependencies mkldnn) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 324e29f931ecbb6beab2d363daa01a19b1a56b3e..4c4f59656dae68739f2f07f3febd510e727fe2dd 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -29,7 +29,7 @@ IF(NOT ${CBLAS_FOUND}) "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "openblas library." FORCE) - SET(OPENBLAS_CC "${CMAKE_C_COMPILER}") + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") IF(CMAKE_CROSSCOMPILING) SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) @@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND}) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) ENDIF() ELSEIF(IOS) - # FIXME(liuyiqun): support multiple architectures - SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") - SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") - IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7") - SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7") - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) - ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") + IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") + SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") + SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) + ELSE() + MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. " + "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.") ENDIF() ELSEIF(RPI) # use hardfp diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 8bd058222880b4df3b08da09c02f9fe7f1d0ee66..a8e1aca49c97df256b1269c286b0bce7732fa932 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +IF(MOBILE_INFERENCE) + return() +ENDIF() + INCLUDE(ExternalProject) SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc) diff --git a/cmake/util.cmake b/cmake/util.cmake index 117ab7f49cdf4a568cd203b2b17767643d0b2d50..ad905ab55ba3537054fa5b30b5fca4d83c406702 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -115,8 +115,8 @@ function(link_paddle_exe TARGET_NAME) target_link_libraries(${TARGET_NAME} log) endif(ANDROID) - if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR) - target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed") + if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB) + target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") endif() add_dependencies(${TARGET_NAME} ${external_project_dependencies}) diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index 3bb52707974d633d7933f16592354455a5d451cc..d4d182f6692e09b3e40f3620b77d9a0f20ec5af3 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -335,6 +335,16 @@ bilinear_interp .. autoclass:: paddle.v2.layer.bilinear_interp :noindex: +dot_prod +--------- +.. autoclass:: paddle.v2.layer.dot_prod + :noindex: + +out_prod +-------- +.. autoclass:: paddle.v2.layer.out_prod + :noindex: + power ----- .. autoclass:: paddle.v2.layer.power diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD index 16236763a73770f3fe5eadf67645765d0456f875..ec6d4681836e189f46dbb9b915a237dc15cda7cf 100644 --- a/doc/design/mkldnn/README.MD +++ b/doc/design/mkldnn/README.MD @@ -36,13 +36,13 @@ Figure 1. PaddlePaddle on IA. 我们把集成方案大致分为了如下几个方面。 ### CMake -我们会在`CMakeLists.txt`中会添加`WITH_MKLDNN`的选项,当设置这个值为`ON`的时候会启用编译MKL-DNN功能。同时会自动开启OpenMP用于提高MKL-DNN的性能。 +我们会在`CMakeLists.txt`中会给用户添加一个`WITH_MKL`的开关,他是负责`WITH_MKLML`和`WITH_MKLDNN`的总开关。 -同时,我们会引入`WITH_MKLML`选项,用于选择是否使用MKL-DNN自带的MKLML安装包。这个安装包可以独立于MKL-DNN使用,但是建议在开启MKL-DNN的同时也打开MKLML的开关,这样才能发挥最好的性能。 +当打开`WITH_MKL`时,会开启MKLML的功能,作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 如果系统支持AVX2指令集及以上,同时会开启MKL-DNN功能。 -所以,我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中。 +当关闭`WITH_MKL`时,MKLML和MKL-DNN功能会同时关闭。 -**备注**:当`WITH_MKLML=ON`的时候,会优先使用这个包作为PaddlePaddle的CBLAS和LAPACK库,所以会稍微改动`cmake/cblas.cmake`中的逻辑。 +所以,我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中。 ### Layers 所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在 diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst index 731a63f945c29ba78538b3d71289b234e569354d..61f3a223547b352cf7929615cf3682b29b9a738f 100644 --- a/doc/howto/dev/write_docs_cn.rst +++ b/doc/howto/dev/write_docs_cn.rst @@ -34,7 +34,7 @@ PaddlePaddle的文档构建有两种方式。 cd TO_YOUR_PADDLE_CLONE_PATH mkdir -p build cd build - cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON + cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON make gen_proto_py make paddle_docs paddle_docs_cn diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md index 882066f23714f7ab3bba9199b5fa5ff2325ce849..424d7718c64438496cf0895397babd5408e1ca02 100644 --- a/doc/mobile/cross_compiling_for_android_cn.md +++ b/doc/mobile/cross_compiling_for_android_cn.md @@ -1,4 +1,4 @@ -# 构建Android平台上的PaddlePaddle库 +# Android平台编译指南 用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库: - 基于Docker容器的编译方式 diff --git a/doc/mobile/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md index cda636a67de712e072f4cc7ad859dda75211eaa8..9da48e7f2119ce901fbb3abab73400df27be16d2 100644 --- a/doc/mobile/cross_compiling_for_ios_cn.md +++ b/doc/mobile/cross_compiling_for_ios_cn.md @@ -1,4 +1,4 @@ -# 构建iOS平台上的PaddlePaddle库 +# iOS平台编译指南 交叉编译iOS平台上适用的PaddlePaddle库,需要在MacOS系统上进行。本文的将介绍在MacOS上,从源码交叉编译iOS平台上适用的PaddlePaddle库。 ## 准备交叉编译环境 @@ -25,7 +25,7 @@ iOS平台可选配置参数: - `IOS_PLATFORM`,可设置为`OS/SIMULATOR`,默认值为`OS`。 - `OS`,构建目标为`arm`架构的iPhone或者iPad等物理设备。 - `SIMULATOR`,构建目标为`x86`架构的模拟器平台。 -- `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示: +- `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示,默认编译所有架构: @@ -41,11 +41,11 @@ iOS平台可选配置参数: - + - +
OSarmv7, armv7s, arm64 (默认)armv7, armv7s, arm64
SIMULATORi386, x86_64 (默认)i386, x86_64
@@ -66,7 +66,7 @@ iOS平台可选配置参数: ```bash cmake -DCMAKE_SYSTEM_NAME=iOS \ -DIOS_PLATFORM=OS \ - -DIOS_ARCH="arm64" \ + -DIOS_ARCH="armv7;arm64" \ -DIOS_ENABLE_BITCODE=ON \ -DIOS_USE_VECLIB_FOR_BLAS=ON \ -DCMAKE_INSTALL_PREFIX=your/path/to/install \ @@ -112,6 +112,6 @@ $ make install - `lib`目录,其中包含PaddlePaddle的C-API静态库 - `third_party`目录,其中包含所依赖的所有第三方库 -注意,不同架构的PaddlePaddle库建议安装到不同的目录下,然后使用`lipo`工具将多个静态库合并成一个支持多个架构的fat库。 +注意,如果PaddlePaddle库需要同时支持真机和模拟器,则需要分别编译真机和模拟器版本,然后使用`lipo`工具合并fat库。 自此,PaddlePaddle库已经安装完成,用户可将合成的fat库用于深度学习相关的iOS App中,调用方法见C-API文档。 diff --git a/doc/mobile/cross_compiling_for_raspberry_cn.md b/doc/mobile/cross_compiling_for_raspberry_cn.md index 6e983645faaed1f67edaeeb82ddbef9cef6bb85f..f8ef9dc8031613831437745995268f3abc392f5b 100644 --- a/doc/mobile/cross_compiling_for_raspberry_cn.md +++ b/doc/mobile/cross_compiling_for_raspberry_cn.md @@ -1,4 +1,4 @@ -# 构建Raspberry Pi平台上的PaddlePaddle库 +# Raspberry Pi平台编译指南 通常有两个方法来构建基于 Rasspberry Pi 的版本: diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp index 78c43949dfe325d0e1a6ba10ae51cb7b858f6c52..bb8249a5511c089ec2f2263ff4cc290f0a5a8fce 100644 --- a/paddle/capi/Main.cpp +++ b/paddle/capi/Main.cpp @@ -29,6 +29,9 @@ static void initPaddle(int argc, char** argv) { extern "C" { paddle_error paddle_init(int argc, char** argv) { + static bool isInit = false; + if (isInit) return kPD_NO_ERROR; + std::vector realArgv; realArgv.reserve(argc + 1); realArgv.push_back(strdup("")); @@ -37,6 +40,7 @@ paddle_error paddle_init(int argc, char** argv) { } initPaddle(argc + 1, realArgv.data()); free(realArgv[0]); + isInit = true; return kPD_NO_ERROR; } } diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h index ede2670882ee2b93f610a2261a4ecc1784bc2d0c..4ab8de80d1c7be0f8e3eb848955373dd5e21bc18 100644 --- a/paddle/cuda/include/hl_gpu.h +++ b/paddle/cuda/include/hl_gpu.h @@ -25,7 +25,9 @@ limitations under the License. */ #include "hl_matrix.h" #include "hl_sequence.h" #include "hl_sparse.h" +#ifndef PADDLE_MOBILE_INFERENCE #include "hl_warpctc_wrap.h" +#endif #ifdef HPPL_STUB_FUNC #include "stub/hl_aggregate_stub.h" diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index b3b9c45ded95ce2e735b8898d47760956dcacdce..00d9dd238ec5328be28f58f8118daad3a039e08c 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -270,6 +270,19 @@ static bool AllGradInSet(const std::vector& names, return false; } } + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + sout << "All input {"; + for (auto& name : names) { + sout << name << ","; + } + sout << "} is in {"; + for (auto& name : set) { + sout << name << ","; + } + sout << "}"; + VLOG(10) << sout.str(); + } return true; } @@ -290,14 +303,12 @@ static void CreateGradVarInBlock( auto ops = block_desc->AllOps(); for (size_t op_index = grad_op_start_index; op_index < ops.size(); ++op_index) { - bool need_infer_shape = false; std::unordered_set new_vars; ForEachVarName(ops[op_index]->Outputs(), [&](const std::string& grad_var_name) { if (block_desc->HasVar(grad_var_name)) { return false; } - need_infer_shape = true; auto var = block_desc->Var(grad_var_name); new_vars.insert(var->Name()); auto it = param_name_map.find(grad_var_name); @@ -311,23 +322,21 @@ static void CreateGradVarInBlock( grad_record.op_idx_ = static_cast(op_index); return false; /* not break */ }); - if (need_infer_shape) { - ops[op_index]->InferVarType(block_desc); - for (auto& arg : ops[op_index]->OutputArgumentNames()) { - if (new_vars.find(arg) == new_vars.end()) { - continue; - } - auto pname = FwdName(arg); - auto* param = block_desc->FindVarRecursive(pname); - auto* grad = block_desc->FindVar(arg); - if (param == nullptr) { - grad->SetDataType(DataType::FP32); - } else { - grad->SetDataType(param->GetDataType()); - } + ops[op_index]->InferVarType(block_desc); + for (auto& arg : ops[op_index]->OutputArgumentNames()) { + if (new_vars.find(arg) == new_vars.end()) { + continue; + } + auto pname = FwdName(arg); + auto* param = block_desc->FindVarRecursive(pname); + auto* grad = block_desc->FindVar(arg); + if (param == nullptr) { + grad->SetDataType(DataType::FP32); + } else { + grad->SetDataType(param->GetDataType()); } - ops[op_index]->InferShape(*block_desc); } + ops[op_index]->InferShape(*block_desc); } } @@ -387,6 +396,7 @@ std::vector> MakeBlockBackward( ProgramDescBind& program_desc, int block_idx, std::unordered_set* no_grad_vars, std::unordered_map* grad_to_var) { + VLOG(5) << "MakeBlockBackward"; BlockDescBind* cur_block = program_desc.MutableBlock(block_idx); std::vector op_descs = cur_block->AllOps(); std::unordered_map> dup_out_ops; @@ -394,9 +404,10 @@ std::vector> MakeBlockBackward( std::vector> backward_descs; for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) { + VLOG(5) << "Making backward " << (*it)->Type() << " op"; std::vector> op_grads; - if ((*it)->Type() == "recurrent") { + if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") { int step_block_idx = (*it)->GetBlockAttr("step_block"); BlockDescBind* backward_block = CreateStepBlock( program_desc, no_grad_vars, grad_to_var, step_block_idx); @@ -410,6 +421,15 @@ std::vector> MakeBlockBackward( op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var); } + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + sout << "Made "; + for (auto& op_grad : op_grads) { + sout << op_grad->Type() << " "; + } + VLOG(10) << sout.str(); + } + for (const auto& desc : op_grads) { for (const std::string& out_name : desc->OutputArgumentNames()) { if (out_name.find("@GRAD") == std::string::npos) { @@ -425,6 +445,8 @@ std::vector> MakeBlockBackward( op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs), [](std::unique_ptr& ptr) { return std::move(ptr); }); } + + VLOG(5) << "Appending Sums"; // Check whether some variables are written more than once std::list>> pending_sum_ops; for (const auto& dup : dup_out_ops) { @@ -432,16 +454,22 @@ std::vector> MakeBlockBackward( const std::vector dup_op = dup.second; if (out_name != kEmptyVarName && dup_op.size() > 1) { std::vector sum_op_inputs; + std::string next_g_name = out_name; for (size_t i = 0; i < dup_op.size(); ++i) { + VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name + << " duplicated"; std::string new_name = out_name + "@RENAME@" + std::to_string(i); - backward_descs[dup_op[i]]->Rename(out_name, new_name); + backward_descs[dup_op[i]]->RenameOutput(out_name, new_name); + backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name); sum_op_inputs.emplace_back(new_name); + next_g_name = sum_op_inputs.back(); } std::unique_ptr sum_op(new OpDescBind( "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {})); pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); } } + pending_sum_ops.sort( [](const std::pair>& a, const std::pair>& b) { @@ -452,6 +480,8 @@ std::vector> MakeBlockBackward( std::move(p.second)); } + VLOG(5) << "MakeBlockBackward Finished"; + return backward_descs; } diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h index 3ec88d7a72c3339bf5e7d0ca3957a3f608f039b7..be144d8fc0104fccc08006532a85906ade25c2a1 100644 --- a/paddle/framework/data_type.h +++ b/paddle/framework/data_type.h @@ -29,6 +29,8 @@ inline DataType ToDataType(std::type_index type) { return DataType::INT32; } else if (typeid(int64_t).hash_code() == type.hash_code()) { return DataType::INT64; + } else if (typeid(bool).hash_code() == type.hash_code()) { + return DataType::BOOL; } else { PADDLE_THROW("Not supported"); } diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index 53b899a23997b71e723a298ec360a4e018d89878..8b6f42b82df14bfcd25f33ef16b5903fb965a8ba 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -60,8 +60,7 @@ void make_ddim(DDim& ddim, const int64_t* dims, int n) { ddim = make_dim<9>(dims); break; default: - throw std::invalid_argument( - "Dynamic dimensions must have between [1, 9] dimensions."); + PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions."); } } diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 2fcf41d69f0011b0d9a3d89c97fcebacb0703e97..adedd8cb0e8504fd6fc924e62a2ede3c1c7ce698 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -120,6 +120,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id, for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); + VLOG(10) << op->DebugString(); op->Run(*local_scope, *device); } if (create_local_scope) { diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 39c8def82e1ebb10a0e357a648af760099020c32..48cd131550dea5ad3f368b25c31d753efbe0dff9 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -235,6 +235,23 @@ void OpDescBind::Rename(const std::string &old_name, need_update_ = true; } +void OpDescBind::RenameOutput(const std::string &old_name, + const std::string &new_name) { + for (auto &output : outputs_) { + std::replace(output.second.begin(), output.second.end(), old_name, + new_name); + } + need_update_ = true; +} + +void OpDescBind::RenameInput(const std::string &old_name, + const std::string &new_name) { + for (auto &input : inputs_) { + std::replace(input.second.begin(), input.second.end(), old_name, new_name); + } + need_update_ = true; +} + struct SetAttrDescVisitor : public boost::static_visitor { explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {} mutable OpDesc::Attr *attr_; @@ -448,7 +465,12 @@ const std::vector &CompileTimeInferShapeContext::Outputs( DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const { auto var = block_.FindVarRecursive(name); PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); - return framework::make_ddim(var->Shape()); + try { + return framework::make_ddim(var->Shape()); + } catch (...) { + VLOG(5) << "GetDim of variable " << name << " error"; + std::rethrow_exception(std::current_exception()); + } } void CompileTimeInferShapeContext::SetDim(const std::string &name, diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h index e3e96441bbf51729f2ba69c9257e6961b1de0d5c..da032319afa775571d3942bf6ae415db7d233735 100644 --- a/paddle/framework/op_desc.h +++ b/paddle/framework/op_desc.h @@ -73,6 +73,10 @@ class OpDescBind { void Rename(const std::string &old_name, const std::string &new_name); + void RenameOutput(const std::string &old_name, const std::string &new_name); + + void RenameInput(const std::string &old_name, const std::string &new_name); + // Only be used in C++ const AttributeMap &GetAttrMap() const; diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 3276f8af396fe58450a8dc6713fe61e49d5ca708..93467ab8ac796277b47a861a427de2837fb2d3d4 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -403,19 +403,6 @@ class RuntimeInferShapeContext : public InferShapeContext { void OperatorWithKernel::Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const { - if (VLOG_IS_ON(1)) { - auto inputs = this->InputVars(); - auto outputs = this->OutputVars(true); - std::ostringstream sout; - sout << "Run operator " << this->Type() << " From ["; - std::ostream_iterator out_it(sout, ","); - std::copy(inputs.begin(), inputs.end(), out_it); - sout << "] to ["; - std::copy(outputs.begin(), outputs.end(), out_it); - sout << "]"; - VLOG(1) << sout.str(); - } - RuntimeInferShapeContext infer_shape_ctx(*this, scope); this->InferShape(&infer_shape_ctx); diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc index 9428b8a07ea0af005f6e960ddaa02da624ad9d97..9ad6272c99dd6a85520ae44c1331ac232bc6a9a2 100644 --- a/paddle/framework/scope.cc +++ b/paddle/framework/scope.cc @@ -38,11 +38,12 @@ Scope& Scope::NewScope() const { Variable* Scope::Var(const std::string& name) { auto iter = vars_.find(name); if (iter != vars_.end()) { + VLOG(3) << "Get existing variable " << name; return iter->second; } Variable* v = new Variable(); vars_[name] = v; - VLOG(3) << "Create variable " << name << " on scope"; + VLOG(3) << "Create variable " << name; v->name_ = &(vars_.find(name)->first); return v; } diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h index 7d36ead2ca85328c7843b3b5d423cf8e921d1c93..05dc47f06ac81f0acb6d0317cbecb3009c7dd7f0 100644 --- a/paddle/framework/shape_inference.h +++ b/paddle/framework/shape_inference.h @@ -53,6 +53,10 @@ class InferShapeContext { virtual bool IsRuntime() const = 0; + // Note: In while op, we need this to be public + void SetDims(const std::vector &names, + const std::vector &dims); + protected: virtual framework::DDim GetDim(const std::string &name) const = 0; virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0; @@ -60,9 +64,6 @@ class InferShapeContext { std::vector GetDims( const std::vector &names) const; - void SetDims(const std::vector &names, - const std::vector &dims); - std::vector GetVarTypes( const std::vector &names) const; diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt index 91d732641a4a5eed050841b59fd10da397eb732f..41ead3c5ecef248830cfb0f8be360f21dcd58e7b 100644 --- a/paddle/gserver/CMakeLists.txt +++ b/paddle/gserver/CMakeLists.txt @@ -73,7 +73,6 @@ if(MOBILE_INFERENCE) list(REMOVE_ITEM GSERVER_SOURCES dataproviders/DataProvider.cpp dataproviders/MultiDataProvider.cpp - dataproviders/ProtoDataProvider.cpp dataproviders/PyDataProvider2.cpp dataproviders/PyDataProvider.cpp) diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp index 0478256f9cd81f4a99eb0cbcbd1a5a21de5cf14b..106cf5b6228e636026ded558d0f591022f1ae586 100644 --- a/paddle/gserver/dataproviders/DataProvider.cpp +++ b/paddle/gserver/dataproviders/DataProvider.cpp @@ -16,8 +16,8 @@ limitations under the License. */ #include #include -#include "ProtoDataProvider.h" #include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" #include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" @@ -164,8 +164,6 @@ DataProvider* DataProvider::create(const DataConfig& config, REGISTER_DATA_PROVIDER(simple, SimpleDataProvider); REGISTER_DATA_PROVIDER(dummy, DummyDataProvider); -REGISTER_DATA_PROVIDER(proto, ProtoDataProvider); -REGISTER_DATA_PROVIDER(proto_sequence, ProtoSequenceDataProvider); int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) { int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch) diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp deleted file mode 100644 index c6f5cab1915b7f41d505c37a7fef762a392bad7f..0000000000000000000000000000000000000000 --- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp +++ /dev/null @@ -1,932 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ProtoDataProvider.h" -#include -#include -#include -#include "paddle/utils/StringUtil.h" -#include "paddle/utils/Util.h" - -#include "DataProviderGroup.h" -#include "paddle/utils/Logging.h" - -DEFINE_double(memory_threshold_on_load_data, - 1.0, - "stop loading data when memory is not sufficient"); - -namespace paddle { - -REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup); -REGISTER_DATA_PROVIDER(proto_sequence_group, - DataProviderGroup); - -ProtoDataProvider::ProtoDataProvider(const DataConfig& config, - bool useGpu, - bool loadDataAll) - : DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) { - if (loadDataAll) { - loadData(config_.files()); - } -} - -void ProtoDataProvider::loadData(const std::vector& fileList) { - for (auto& file : fileList) { - if (FLAGS_memory_threshold_on_load_data < 1.0) { - double memUsage = getMemoryUsage(); - if (memUsage > FLAGS_memory_threshold_on_load_data) { - LOG(INFO) << "memUsage is " << memUsage << ", > " - << FLAGS_memory_threshold_on_load_data - << " therefore SKIP ALL REMAINING file."; - break; - } - } - LOG(INFO) << "load data file " << file; - loadDataFile(file); - } - - if (sequenceStartPositions_.size() == sampleNums_) { - // This means that each sample is one sequence - shuffledSequenceIds_.swap(sequenceStartPositions_); - } else { - sequenceStartPositions_.push_back(sampleNums_); - shuffledSequenceIds_.reserve(sequenceStartPositions_.size() - 1); - for (size_t i = 0; i < sequenceStartPositions_.size() - 1; ++i) { - shuffledSequenceIds_.push_back(i); - } - } - - LOG(INFO) << "read done, num of instance=" << sampleNums_; - showDataStats(); -} - -void ProtoDataProvider::loadData(const std::string& fileName) { - std::vector fileList; - loadFileList(fileName, fileList); - loadData(fileList); -} - -void ProtoDataProvider::checkDataHeader(const DataHeader& header) { - if (header_.slot_defs_size()) { - // header_ is already set. Need to check consistency. - CHECK_EQ(header_.slot_defs_size(), header.slot_defs_size()) - << "Different header"; - for (int i = 0; i < header.slot_defs_size(); ++i) { - CHECK_EQ(header_.slot_defs(i).type(), header.slot_defs(i).type()); - CHECK_EQ(header_.slot_defs(i).dim(), header.slot_defs(i).dim()); - } - return; - } - - // header_ is not set before - CHECK(header.slot_defs_size()) << "Invalid header: no slot is defined"; - int i; - for (i = 0; i < header.slot_defs_size(); ++i) { - if (header.slot_defs(i).type() == SlotDef::INDEX || - header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX) { - break; - } - constexpr int kBufLen = 100; - char buf[kBufLen]; - snprintf(buf, kBufLen, "slot%d_nnz", i); - nnzStats_.push_back(getStat(buf)); - } - numVecSlots_ = i; - - // Check that INDEX slots are after VECTOR slots - for (int i = numVecSlots_; i < header.slot_defs_size(); ++i) { - CHECK(header.slot_defs(i).type() == SlotDef::INDEX || - header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX); - } - - slots_.clear(); - slots_.reserve(header.slot_defs_size()); - for (int i = 0; i < header.slot_defs_size(); ++i) { - slots_.emplace_back(); - slots_.back().type = header.slot_defs(i).type(); - slots_.back().dim = header.slot_defs(i).dim(); - if (SlotDef::VECTOR_SPARSE_NON_VALUE == header.slot_defs(i).type() || - SlotDef::VECTOR_SPARSE_VALUE == header.slot_defs(i).type()) { - slots_.back().indices.push_back(0); - } - } - - header_ = header; -} - -void ProtoDataProvider::checkSample(const DataSample& sample) { - CHECK_EQ(numVecSlots_, sample.vector_slots_size()); - CHECK(header_.slot_defs_size() == numVecSlots_ + sample.id_slots_size() || - header_.slot_defs_size() == numVecSlots_ + sample.var_id_slots_size()); - for (int i = 0; i < numVecSlots_; ++i) { - uint32_t dim = header_.slot_defs(i).dim(); - switch (header_.slot_defs(i).type()) { - case SlotDef::VECTOR_DENSE: { - CHECK_EQ(static_cast(dim), sample.vector_slots(i).values_size()); - CHECK_EQ(0, sample.vector_slots(i).ids_size()); - break; - } - case SlotDef::VECTOR_SPARSE_NON_VALUE: { - if (0 == sample.vector_slots(i).ids_size()) { - break; - } - CHECK_LT(0, sample.vector_slots(i).ids_size()); - CHECK_EQ(0, sample.vector_slots(i).values_size()); - auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(), - sample.vector_slots(i).ids().end()); - CHECK_GT(dim, maxId); - break; - } - case SlotDef::VECTOR_SPARSE_VALUE: { - if (0 == sample.vector_slots(i).ids_size()) { - CHECK_EQ(0, sample.vector_slots(i).values_size()); - break; - } - CHECK_LT(0, sample.vector_slots(i).values_size()); - CHECK_GE(static_cast(dim), sample.vector_slots(i).values_size()); - CHECK_EQ(sample.vector_slots(i).values_size(), - sample.vector_slots(i).ids_size()); - auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(), - sample.vector_slots(i).ids().end()); - CHECK_GT(dim, maxId); - break; - } - case SlotDef::VAR_MDIM_DENSE: { - if (static_cast(dim) != 0) { - CHECK_EQ(static_cast(dim), sample.vector_slots(i).values_size()); - if (sample.vector_slots(i).dims_size() != 0) { - int totalDim = sample.vector_slots(i).dims(0); - for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) { - totalDim *= sample.vector_slots(i).dims(j); - } - CHECK_EQ(static_cast(dim), totalDim); - } - } else { - CHECK_NE(sample.vector_slots(i).dims_size(), 0); - int totalDim = sample.vector_slots(i).dims(0); - for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) { - totalDim *= sample.vector_slots(i).dims(j); - } - CHECK_EQ(totalDim, sample.vector_slots(i).values_size()); - } - break; - } - case SlotDef::STRING: { - CHECK_EQ(static_cast(1), sample.vector_slots(i).strs_size()); - CHECK_EQ(0, sample.vector_slots(i).ids_size()); - CHECK_EQ(0, sample.vector_slots(i).values_size()); - break; - } - default: - LOG(FATAL) << "BUG: Should not reach here"; - } - } - for (int i = numVecSlots_; i < header_.slot_defs_size(); ++i) { - if (header_.slot_defs(i).type() != SlotDef::VAR_MDIM_INDEX) { - uint32_t id = sample.id_slots(i - numVecSlots_); - if (id == -1U) continue; - CHECK_LT(id, header_.slot_defs(i).dim()); - } else { - for (int j = 0; j < sample.var_id_slots(i - numVecSlots_).ids_size(); - ++j) { - uint32_t id = sample.var_id_slots(i - numVecSlots_).ids(j); - CHECK_LT(id, header_.slot_defs(i).dim()); - } - } - } -} - -void ProtoDataProvider::loadDataFile(const std::string& fileName) { - std::ifstream is(fileName); - CHECK(is) << "Fail to open " << fileName; - bool dataCompression = str::endsWith(fileName, ".gz"); - std::unique_ptr reader(new ProtoReader(&is, dataCompression)); - CHECK(reader) << "Fail to create proto data input stream"; - - DataHeader header; - CHECK(reader->read(&header)); - checkDataHeader(header); - - DataSample sample; - do { - if (!reader->read(&sample)) { - break; - } - checkSample(sample); - if (sample.is_beginning()) { - sequenceStartPositions_.push_back(sampleNums_); - } - fillSlots(sample); - ++sampleNums_; - } while (true); - - CHECK(is.eof()) << "Fail to read file"; - reader.reset(nullptr); - is.close(); -} - -// checkSample has done before, no check here -void ProtoDataProvider::fillSlots(const DataSample& sample) { - for (size_t i = 0; i < slots_.size(); ++i) { - auto& slot = slots_[i]; - int dim = slot.dim; - switch (slot.type) { - case SlotDef::VECTOR_DENSE: { - size_t oldSize = slot.denseData.size(); - slot.denseData.resize(oldSize + dim); - const float* values = sample.vector_slots(i).values().data(); -#ifdef PADDLE_TYPE_DOUBLE - std::copy(values, values + dim, slot.denseData.begin() + oldSize); -#else - memcpy(slot.denseData.data() + oldSize, values, sizeof(real) * dim); -#endif - break; - } - case SlotDef::VECTOR_SPARSE_NON_VALUE: { - int slotSize = sample.vector_slots(i).ids_size(); - int subSlotSize = 0; - int id = 0; // the slot id - // find whether this vector_slots has subseq. If not has subseq, - // subSlotSize = 0. - for (id = 0; id < sample.subseq_slots_size(); id++) { - if (sample.subseq_slots(id).slot_id() == i) { - subSlotSize = sample.subseq_slots(id).lens_size(); - break; - } - } - if (subSlotSize && slot.subIndices.size() == 0UL) { - // If has subSeq, the first element of subIndices = 0. - slot.subIndices.push_back(0); - } - if (slotSize == 0UL) { - // if has no id, new indices = old indices. - slot.indices.push_back(slot.indices.back()); - // if has subSeq, new subIndices = old subIndices. - if (slot.subIndices.size()) { - slot.subIndices.push_back(slot.subIndices.back()); - } - break; - } - slot.sparseNonValueData.resize(slot.indices.back() + slotSize); - const unsigned int* ids = sample.vector_slots(i).ids().data(); - memcpy(slot.sparseNonValueData.data() + slot.indices.back(), - ids, - sizeof(*ids) * slotSize); - slot.indices.push_back(slot.indices.back() + slotSize); - if (subSlotSize) { - for (int ii = 0; ii < subSlotSize; ++ii) { - slot.subIndices.push_back(slot.subIndices.back() + - sample.subseq_slots(id).lens(ii)); - } - } - break; - } - case SlotDef::VECTOR_SPARSE_VALUE: { - if (0 == sample.vector_slots(i).ids_size()) { - slot.indices.push_back(slot.indices.back()); - break; - } - int slotSize = sample.vector_slots(i).ids_size(); - slot.sparseFloatValueData.resize(slot.indices.back() + slotSize); - const unsigned int* ids = sample.vector_slots(i).ids().data(); - const float* values = sample.vector_slots(i).values().data(); - for (int ii = 0; ii < slotSize; ++ii) { - slot.sparseFloatValueData[slot.indices.back() + ii].col = ids[ii]; - slot.sparseFloatValueData[slot.indices.back() + ii].value = - values[ii]; - } - slot.indices.push_back(slot.indices.back() + slotSize); - break; - } - case SlotDef::INDEX: { - slot.indexData.push_back(sample.id_slots(i - numVecSlots_)); - break; - } - case SlotDef::VAR_MDIM_DENSE: { - size_t oldSize = slot.varDenseData.size(); - slot.varDenseData.resize(oldSize + 1); - size_t varDim = sample.vector_slots(i).values_size(); - slot.varDenseData[oldSize].data.resize(varDim); - const float* values = sample.vector_slots(i).values().data(); -#ifdef PADDLE_TYPE_DOUBLE - std::copy( - values, values + varDim, slot.varDenseData[oldSize].data.data()); -#else - memcpy(slot.varDenseData[oldSize].data.data(), - values, - sizeof(real) * varDim); -#endif - slot.varDenseData[oldSize].dims.resize( - sample.vector_slots(i).dims_size()); - memcpy(slot.varDenseData[oldSize].dims.data(), - sample.vector_slots(i).dims().data(), - sizeof(uint32_t) * sample.vector_slots(i).dims_size()); - break; - } - case SlotDef::VAR_MDIM_INDEX: { - size_t oldSize = slot.varIndices.size(); - slot.varIndices.resize(oldSize + 1); - size_t varDim = sample.var_id_slots(i - numVecSlots_).ids_size(); - slot.varIndices[oldSize].resize(varDim); - memcpy(slot.varIndices[oldSize].data(), - sample.var_id_slots(i - numVecSlots_).ids().data(), - sizeof(uint32_t) * varDim); - break; - } - case SlotDef::STRING: { - slot.strData.push_back(sample.vector_slots(i).strs(0)); - break; - } - } - } -} - -void ProtoDataProvider::showDataStats() { - std::ostringstream oss; - for (size_t i = 0; i < slots_.size(); ++i) { - auto& slot = slots_[i]; - if (slot.type == SlotDef::VECTOR_SPARSE_NON_VALUE) { - size_t nnz = slot.sparseNonValueData.size(); - oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; "; - } else if (slot.type == SlotDef::VECTOR_SPARSE_VALUE) { - size_t nnz = slot.sparseFloatValueData.size(); - oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; "; - } - } - LOG(INFO) << oss.str(); -} - -void ProtoDataProvider::reset() { - currentSequenceIndex_ = 0; - if (!skipShuffle_) { - shuffle(); - } - - DataProvider::reset(); -} - -void ProtoDataProvider::shuffle() { - std::shuffle(shuffledSequenceIds_.begin(), - shuffledSequenceIds_.end(), - ThreadLocalRandomEngine::get()); -} - -/* - Loop through sequences starting from currentSequenceIndex_ - for at most size samples. For each sequence ranging from [begin, end), - op(begin, end) will be called. - - return the number of sequences scanned -*/ -template -int64_t ProtoDataProvider::sequenceLoop(Op op, int64_t size) { - int64_t sz = 0; - size_t i; - size_t sequenceCount = shuffledSequenceIds_.size(); - if (usageRatio_ < 1.0f) { - sequenceCount = static_cast(sequenceCount * usageRatio_); - } - for (i = currentSequenceIndex_; i < sequenceCount; ++i) { - size_t id = shuffledSequenceIds_[i]; - int64_t begin = sequenceStartPositions_[id]; - int64_t end = sequenceStartPositions_[id + 1]; - int64_t len = end - begin; - if (sz + len > size && sz > 0) break; - sz += len; - op(begin, end); - } - return i - currentSequenceIndex_; -} - -/* - Loop through sequences starting from currentSequenceIndex_ - for at most size samples. For each sample of each sequence at position - pos, op(pos) will be called. - - return the number of sequences scanned -*/ -template -int64_t ProtoDataProvider::sampleLoop(Op op, int64_t size) { - if (iidData()) { - size = std::min(sampleNums_ - currentSequenceIndex_, size); - for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size; - ++i) { - size_t pos = shuffledSequenceIds_[i]; - op(pos); - } - return size; - } else { - auto f = [op](int64_t begin, int64_t end) { - for (int64_t pos = begin; pos < end; ++pos) { - op(pos); - } - }; - return sequenceLoop(f, size); - } -} - -/* - Loop through sub-sequences starting from currentSequenceIndex_ - for at most size samples. For each sample of each sub-sequence at position - pos, op(pos) will be called. - - return the number of sub-sequences scanned -*/ -template -int64_t ProtoDataProvider::subSampleLoop(Op op, int64_t size, int slot) { - CHECK(iidData()) << "subSampleLoop only accepts iid data"; - size = std::min(sampleNums_ - currentSequenceIndex_, size); - int subSize = 0; - for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size; - ++i) { - size_t pos = shuffledSequenceIds_[i]; - int64_t* indexs = slots_[slot].indices.data(); - int64_t* subIndexs = slots_[slot].subIndices.data(); - int64_t subSeqStart = 0; - int64_t subSeqEnd = 0; - for (int j = 0; j < (int)slots_[slot].subIndices.size(); j++) { - if (subIndexs[j] == indexs[pos]) { - subSeqStart = j; - if (subIndexs[pos] == subIndexs[pos + 1]) { - subSeqEnd = j + 1; - break; - } - } else if (subIndexs[j] == indexs[pos + 1]) { - subSeqEnd = j; - break; - } - } - for (int j = subSeqStart; j < subSeqEnd; j++) { - op(j); - } - subSize += subSeqEnd - subSeqStart; - } - return subSize; -} - -int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, - DataBatch* batch) { - int64_t numSequences = 0; // actual number of sequences in the batch - - // the number of sequences scanned, including those skipped because too long - int64_t numScannedSeqs = 0; - std::lock_guard guard(lock_); - if (iidData()) { - size = std::min(getSize() - currentSequenceIndex_, size); - numScannedSeqs = numSequences = size; - } else { - int64_t sz = 0; - auto op = [&sz, &numSequences](int64_t begin, int64_t end) { - ++numSequences; - sz += end - begin; - }; - numScannedSeqs = sequenceLoop(op, size); - VLOG_IF(1, numScannedSeqs > numSequences) - << numScannedSeqs - numSequences - << " sequences are skipped because longer than " << size; - size = sz; - } - if (size <= 0) return 0; - - DataBatch& cpuBatch = *cpuBatch_; - std::vector& cpuArguments = cpuBatch.getStreams(); - cpuBatch.setSize(size); - cpuArguments.resize(header_.slot_defs_size()); - - if (!iidData()) { - ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions, - numSequences + 1, - /* useGpu= */ false); - int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false); - int pos = 0; - int i = 0; - auto op = [buf, &pos, &i](int64_t begin, int64_t end) { - buf[i] = pos; - pos += end - begin; - ++i; - }; - sequenceLoop(op, size); - buf[i] = size; - for (size_t slot = 1; slot < cpuArguments.size(); ++slot) { - cpuArguments[slot].sequenceStartPositions = - cpuArguments[0].sequenceStartPositions; - } - } - - for (int slot = 0; slot < header_.slot_defs_size(); ++slot) { - size_t dim = header_.slot_defs(slot).dim(); - SlotDef::SlotType slotType = header_.slot_defs(slot).type(); - - std::vector dataPos; - dataPos.reserve(size); - auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); }; - sampleLoop(op, size); - - switch (slotType) { - case SlotDef::VECTOR_DENSE: { - Matrix::resizeOrCreate(cpuArguments[slot].value, - size, - dim, - false, // trans = false - false); // useGpu = false - real* buf = cpuArguments[slot].value->getData(); - for (int i = 0; i < size; ++i) { - memcpy(buf + i * dim, - slots_[slot].denseData.data() + dataPos[i] * dim, - sizeof(real) * dim); - } - break; - } - case SlotDef::VECTOR_SPARSE_NON_VALUE: { - if (!(cpuArguments[slot].value)) { - cpuArguments[slot].value = - Matrix::createSparseMatrix(size, - dim, - size /*DEFAULT_AVG_WIDTH = 1*/, - NO_VALUE, - SPARSE_CSR, - false, - useGpu_); - } - auto mat = cpuArguments[slot].value; - mat->resize(size, dim); - if (std::dynamic_pointer_cast(mat)) { - std::dynamic_pointer_cast(mat)->copyFrom( - dataPos.data(), - slots_[slot].indices.data(), - slots_[slot].sparseNonValueData.data(), - HPPL_STREAM_1); - } else if (std::dynamic_pointer_cast(mat)) { - std::dynamic_pointer_cast(mat)->copyFrom( - dataPos.data(), - slots_[slot].indices.data(), - slots_[slot].sparseNonValueData.data()); - } else { - LOG(FATAL) << "Not Supported"; - } - size_t numElements = 0; - for (auto pos : dataPos) { - numElements += - slots_[slot].indices[pos + 1] - slots_[slot].indices[pos]; - } - nnzStats_[slot]->addSample(numElements); - - break; - } - case SlotDef::VECTOR_SPARSE_VALUE: { - if (!(cpuArguments[slot].value)) { - cpuArguments[slot].value = - Matrix::createSparseMatrix(size, - dim, - size /*DEFAULT_AVG_WIDTH = 1*/, - FLOAT_VALUE, - SPARSE_CSR, - false, - useGpu_); - } - auto mat = cpuArguments[slot].value; - mat->resize(size, dim); - if (std::dynamic_pointer_cast(mat)) { - std::dynamic_pointer_cast(mat)->copyFrom( - dataPos.data(), - slots_[slot].indices.data(), - slots_[slot].sparseFloatValueData.data(), - HPPL_STREAM_1); - } else if (std::dynamic_pointer_cast(mat)) { - std::dynamic_pointer_cast(mat)->copyFrom( - dataPos.data(), - slots_[slot].indices.data(), - slots_[slot].sparseFloatValueData.data()); - } else { - LOG(FATAL) << "Not Supported"; - } - break; - } - case SlotDef::INDEX: { - IVector::resizeOrCreate(cpuArguments[slot].ids, - size, - /* useGpu= */ false); - int* buf = cpuArguments[slot].ids->getData(); - for (int i = 0; i < size; ++i) { - buf[i] = slots_[slot].indexData[dataPos[i]]; - } - break; - } - case SlotDef::VAR_MDIM_DENSE: { - CHECK_EQ(size, 1); - auto mat = cpuArguments[slot].value; - size_t totalDim = slots_[slot].varDenseData[dataPos[0]].data.size(); - - CHECK_EQ(slots_[slot].varDenseData[dataPos[0]].dims.size(), size_t(3)); - size_t height, width, depth, oldWidth; - /* dims[2] is depth, will be changed to dims[0] in future */ - depth = slots_[slot].varDenseData[dataPos[0]].dims[2]; - height = slots_[slot].varDenseData[dataPos[0]].dims[1]; - width = slots_[slot].varDenseData[dataPos[0]].dims[0]; - oldWidth = width; - /* process the undesirable sample */ - if (oldWidth < height) { - width = height; - } - cpuArguments[slot].setFrameHeight(height); - cpuArguments[slot].setFrameWidth(width); - - if (oldWidth < height) { - totalDim = width * height * depth; - } - Matrix::resizeOrCreate(cpuArguments[slot].value, - size, - totalDim, - false, // trans = false - false); // useGpu = false - real* buf = cpuArguments[slot].value->getData(); - cpuArguments[slot].value->zeroMem(); - if (oldWidth < height) { - real* srcBuf = slots_[slot].varDenseData[dataPos[0]].data.data(); - for (size_t i = 0; i < depth; i++) { - for (size_t j = 0; j < height; j++) { - for (size_t k = 0; k < oldWidth; k++) { - buf[i * height * width + j * width + k] = - srcBuf[i * height * oldWidth + j * oldWidth + k]; - } - } - } - } else { - memcpy(buf, - slots_[slot].varDenseData[dataPos[0]].data.data(), - sizeof(real) * totalDim); - } - ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions, - size + 1, /* size == 1 currently */ - /* useGpu= */ false); - int* bufStarts = - cpuArguments[slot].sequenceStartPositions->getMutableData(false); - bufStarts[0] = 0; - bufStarts[1] = 1; - break; - } - case SlotDef::VAR_MDIM_INDEX: { - CHECK_EQ(size, 1); - size_t totalDim = slots_[slot].varIndices[dataPos[0]].size(); - IVector::resizeOrCreate(cpuArguments[slot].ids, - totalDim, - /* useGpu= */ false); - int* buf = cpuArguments[slot].ids->getData(); - memcpy(buf, - slots_[slot].varIndices[dataPos[0]].data(), - sizeof(int) * totalDim); - - ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions, - size + 1, /* size == 1 currently */ - /* useGpu= */ false); - int* bufStarts = - cpuArguments[slot].sequenceStartPositions->getMutableData(false); - bufStarts[0] = 0; - /* we expand the convolutinal feature map to a sequence data, - * so there should be a corresponding sequence labels */ - bufStarts[1] = totalDim; - break; - } - case SlotDef::STRING: { - if (cpuArguments[slot].strs) { - cpuArguments[slot].strs->resize(size); - } else { - cpuArguments[slot].strs = - std::make_shared>(size); - } - for (int i = 0; i < size; ++i) { - (*cpuArguments[slot].strs)[i] = slots_[slot].strData[dataPos[i]]; - } - break; - } - } - } - - if (useGpu_) { - std::vector& cpuArguments = cpuBatch.getStreams(); - DataBatch& gpuBatch = *gpuBatch_; - std::vector& gpuArguments = gpuBatch.getStreams(); - gpuArguments.resize(cpuArguments.size()); - gpuBatch.setSize(size); - for (int i = 0; i < header_.slot_defs_size(); ++i) { - SlotDef::SlotType slotType = header_.slot_defs(i).type(); - if (SlotDef::VECTOR_SPARSE_VALUE == slotType || - SlotDef::VECTOR_SPARSE_NON_VALUE == slotType) { - gpuArguments[i] = cpuArguments[i]; - gpuArguments[i].sequenceStartPositions = - cpuArguments[i].sequenceStartPositions; - } else { - gpuArguments[i].resizeAndCopyFrom( - cpuArguments[i], useGpu_, HPPL_STREAM_1); - } - } - hl_stream_synchronize(HPPL_STREAM_1); - *batch = gpuBatch; - } else { - *batch = cpuBatch; - } - - currentSequenceIndex_ += numScannedSeqs; - - return batch->getSize(); -} - -ProtoSequenceDataProvider::ProtoSequenceDataProvider(const DataConfig& config, - bool useGpu, - bool loadDataAll) - : ProtoDataProvider(config, useGpu, loadDataAll) {} - -int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size, - DataBatch* batch) { - CHECK(iidData()) << "ProtoSequenceDataProvider only accepts iid data"; - int64_t numSequences = 0; // actual number of sequences in the batch - - // the number of sequences scanned, including those skipped because too long - int64_t numScannedSeqs = 0; - std::lock_guard guard(lock_); - size = std::min(getSize() - currentSequenceIndex_, size); - numScannedSeqs = numSequences = size; - if (size <= 0) return 0; - - DataBatch& cpuBatch = *cpuBatch_; - std::vector& cpuArguments = cpuBatch.getStreams(); - cpuBatch.setSize(size); - cpuArguments.resize(header_.slot_defs_size()); - - for (int slot = 0; slot < header_.slot_defs_size(); ++slot) { - SlotDef::SlotType slotType = header_.slot_defs(slot).type(); - - std::vector dataPos; - dataPos.reserve(size); - auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); }; - sampleLoop(op, size); - - // current slot: sequenceStartPositions - ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions, - size + 1, - /* useGpu= */ false); - - switch (slotType) { - case SlotDef::VECTOR_SPARSE_VALUE: - case SlotDef::VAR_MDIM_DENSE: - case SlotDef::VAR_MDIM_INDEX: { - LOG(FATAL) << "ProtoSequenceDataProvider only support" - << " VECTOR_DENSE, VECTOR_SPARSE_NON_VALUE and INDEX slots"; - break; - } - case SlotDef::VECTOR_SPARSE_NON_VALUE: { - // copy to IDS, not value - // pointers used in current slot - sparse_non_value_t* data = slots_[slot].sparseNonValueData.data(); - int64_t* indexs = slots_[slot].indices.data(); - int64_t* seqs = dataPos.data(); - - // current slot: i need size instances. what is the total length? - int totalFeatureInCurrentSlot = 0; - for (int ins = 0; ins < size; ins++) { - int64_t currInsId = seqs[ins]; - totalFeatureInCurrentSlot += - indexs[currInsId + 1] - indexs[currInsId]; - // special: if current instance has NO feature in current slot - if (indexs[currInsId + 1] == indexs[currInsId]) { - totalFeatureInCurrentSlot++; - } - } - // done - - // current slot: ids - IVector::resizeOrCreate(cpuArguments[slot].ids, - totalFeatureInCurrentSlot, - /* useGpu= */ false); - - // where to write - int* currPosOfArgumentId = cpuArguments[slot].ids->getData(); - int* currPosOfArgumentSeqStart = - cpuArguments[slot].sequenceStartPositions->getMutableData(false); - int allSequenceLength = 0; - currPosOfArgumentSeqStart[0] = 0; - // for each instance, copy data and fill sequence positions - for (int instance = 0; instance < size; instance++) { - int64_t currInstanceId = seqs[instance]; - int64_t currInstanceLength = - indexs[currInstanceId + 1] - indexs[currInstanceId]; - sparse_non_value_t* currInstanceData = data + indexs[currInstanceId]; - // write sequenceStartPositions - allSequenceLength += currInstanceLength; - currPosOfArgumentSeqStart[instance + 1] = allSequenceLength; - // copy features - for (int featCopier = 0; featCopier < currInstanceLength; - featCopier++) { - currPosOfArgumentId[featCopier] = currInstanceData[featCopier].col; - } - currPosOfArgumentId += currInstanceLength; - // special: if current instance has NO feature in current slot - if (currInstanceLength == 0) { - allSequenceLength++; - currPosOfArgumentSeqStart[instance + 1] = allSequenceLength; - currPosOfArgumentId[0] = -1; - currPosOfArgumentId++; - } - // done - } - if (slots_[slot].subIndices.size()) { - std::vector dataSubPos; - auto op = [this, &dataSubPos](int64_t pos) { - dataSubPos.push_back(pos); - }; - int subSize = subSampleLoop(op, size, slot); - ICpuGpuVector::resizeOrCreate( - cpuArguments[slot].subSequenceStartPositions, subSize + 1, false); - int* currPosOfArgumentSubSeqStart = - cpuArguments[slot].subSequenceStartPositions->getMutableData( - false); - int64_t* subSeqs = dataSubPos.data(); - int64_t* subIndexs = slots_[slot].subIndices.data(); - int allSubSequenceLength = 0; - currPosOfArgumentSubSeqStart[0] = 0; - // for each instance, compute sub-sequence number - for (int instance = 0; instance < subSize; instance++) { - int64_t currSubInstanceId = subSeqs[instance]; - int64_t currSubInstanceLength = - subIndexs[currSubInstanceId + 1] - subIndexs[currSubInstanceId]; - // write subSequenceStartPositions - allSubSequenceLength += currSubInstanceLength; - currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength; - // special: if current instance has NO feature in current slot - if (currSubInstanceLength == 0) { - allSubSequenceLength++; - currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength; - } - } - cpuArguments[slot].checkSubset(); - } - break; - } - case SlotDef::INDEX: { - // label slot - IVector::resizeOrCreate(cpuArguments[slot].ids, - size, - /* useGpu= */ false); - // fill labels - int* buf = cpuArguments[slot].ids->getData(); - for (int i = 0; i < size; ++i) { - buf[i] = slots_[slot].indexData[dataPos[i]]; - } - // label HAS sequence structure - cpuArguments[slot].sequenceStartPositions->fillSequence(false); - break; - } - case SlotDef::VECTOR_DENSE: { - // copy values - size_t dim = header_.slot_defs(slot).dim(); - Matrix::resizeOrCreate(cpuArguments[slot].value, - size, - dim, - false, // trans = false - false); // useGpu = false - real* buf = cpuArguments[slot].value->getData(); - for (int i = 0; i < size; ++i) { - memcpy(buf + i * dim, - slots_[slot].denseData.data() + dataPos[i] * dim, - sizeof(real) * dim); - } - // sequence structure - cpuArguments[slot].sequenceStartPositions->fillSequence(false); - break; - } - default: { LOG(FATAL) << "should not reach here"; } - } - } - - if (useGpu_) { - std::vector& cpuArguments = cpuBatch.getStreams(); - DataBatch& gpuBatch = *gpuBatch_; - std::vector& gpuArguments = gpuBatch.getStreams(); - gpuArguments.resize(cpuArguments.size()); - gpuBatch.setSize(size); - for (size_t i = 0; i < cpuArguments.size(); ++i) { - gpuArguments[i].resizeAndCopyFrom( - cpuArguments[i], useGpu_, HPPL_STREAM_1); - } - hl_stream_synchronize(HPPL_STREAM_1); - *batch = gpuBatch; - } else { - *batch = cpuBatch; - } - - currentSequenceIndex_ += numScannedSeqs; - return batch->getSize(); -} - -} // namespace paddle diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h deleted file mode 100644 index 7dd45e062248f20d24c633dd4e1c8b7eebcbfa1b..0000000000000000000000000000000000000000 --- a/paddle/gserver/dataproviders/ProtoDataProvider.h +++ /dev/null @@ -1,179 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "DataFormat.pb.h" -#include "paddle/utils/Stat.h" - -#include "DataProvider.h" -#include "ProtoReader.h" - -namespace paddle { - -/** - * @brief Provider data from protobuf data file with each sample - * specified by proto message - * - * DataSample defined in DataFormat.proto. - * - * The file format is - * - * header - * - * sample1 - * - * sample2 - * - * ... - * - * sampleN - * - * @note: In the data file, each message is prefixed with its length. - * The read/write of the protbuf are implemented in ProtoReader.h - */ -class ProtoDataProvider : public DataProvider { -public: - ProtoDataProvider(const DataConfig& config, - bool useGpu, - bool loadDataAll = true); - virtual void reset(); - - /** - * @note this size includes the sequences which are skipped because they - * are longer than the batch size. - */ - virtual int64_t getSize() { - int64_t size = sampleNums_; - if (usageRatio_ < 1.0f) { - size = static_cast(size * usageRatio_); - } - return size; - } - virtual void shuffle(); - - void loadData(const std::vector& fileList); - - virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch); - -protected: - /** - * @brief load protobuf data from a list of file - * @param[in] fileName file name of a file which contains - * a list of file names - */ - void loadData(const std::string& fileName); - - /** - * @brief load protobuf data from file - * @param[in] fileName data file name - */ - void loadDataFile(const std::string& fileName); - /** @brief check data header of each data sample - * @param[in] header data header read from protobuf data - */ - void checkDataHeader(const DataHeader& header); - /** - * @brief fill protobuf data into slot_, - * slot_ is a vector of ProtoSlot in memory. - * @param[in] sample data sample read from protobuf data - */ - void fillSlots(const DataSample& sample); - - /** - * @brief return true if each sample is one sequence, i.e., independent - * of other samples. - */ - inline bool iidData() const { return sequenceStartPositions_.empty(); } - - /** - * @brief check that sample is consistent with header_ - */ - void checkSample(const DataSample& sample); - - template - int64_t sequenceLoop(Op op, int64_t size); - - template - int64_t sampleLoop(Op op, int64_t size); - - template - int64_t subSampleLoop(Op op, int64_t size, int slot); - - void showDataStats(); - -protected: - struct ProtoVarSlot { - std::vector data; - std::vector dims; - }; - - struct ProtoSlot { - SlotDef::SlotType type; - int dim; - std::vector indexData; - std::vector denseData; - std::vector sparseNonValueData; - std::vector sparseFloatValueData; - std::vector indices; - std::vector subIndices; - - std::vector varDenseData; - std::vector> varIndices; - std::vector strData; - }; - DataHeader header_; - int numVecSlots_; - - std::vector slots_; - size_t sampleNums_; - - /** - * The starting position of each sequence in samples. - * The last element should be num of samples. - * If empty, each sample is one sequence. - */ - std::vector sequenceStartPositions_; - - int64_t currentSequenceIndex_; - - // The size should be the number of sequences. - std::vector shuffledSequenceIds_; - - ThreadLocalD cpuBatch_; - ThreadLocalD gpuBatch_; - - RWLock lock_; - std::vector nnzStats_; // stats for number of none-zeros entries -}; - -/** - * @brief Special use for Proto data: instances should contain sparse-non-value - * slots - * and label. - * - * @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE - */ -class ProtoSequenceDataProvider : public ProtoDataProvider { -public: - ProtoSequenceDataProvider(const DataConfig& config, - bool useGpu, - bool loadDataAll = true); - ~ProtoSequenceDataProvider() {} - virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch); -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/DotProdLayer.cpp b/paddle/gserver/layers/DotProdLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9e2dbe3c3c416f606d2938701f26288642b55267 --- /dev/null +++ b/paddle/gserver/layers/DotProdLayer.cpp @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/math/Matrix.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +/** + * @brief A layer for computing the dot product of two vectors. + * Input1: vector (batchSize * dim) + * Input2: vector (batchSize * dim) + * Output: a matrix: (batchSize * 1) + */ + +class DotProdLayer : public Layer { +public: + explicit DotProdLayer(const LayerConfig& config) : Layer(config) {} + + ~DotProdLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(dot_prod, DotProdLayer); + +bool DotProdLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2U); + CHECK_EQ(1UL, getSize()) + << "The output dimensionality of this layer should be fixed to 1."; + + return true; +} + +void DotProdLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + + size_t batchSize = inV0->getHeight(); + CHECK_EQ(inV1->getHeight(), batchSize); + CHECK_EQ(inV0->getWidth(), inV1->getWidth()); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + reserveOutput(batchSize, 1); + } + + MatrixPtr outV = getOutputValue(); + { + REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str()); + outV->sumOfProducts(*inV0, *inV1, 1, 0); + } +} + +void DotProdLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr outG = getOutputGrad(); + MatrixPtr inG0 = getInputGrad(0); + MatrixPtr inG1 = getInputGrad(1); + + { + REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str()); + + if (inG0) { + inG0->addRowScale(0, *inV1, *outG); + } + + if (inG1) { + inG1->addRowScale(0, *inV0, *outG); + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.cpp b/paddle/gserver/layers/MKLDNNConcatLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c9099297cc5c741fbae0b42f21b988e6c561ef11 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNConcatLayer.cpp @@ -0,0 +1,202 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNConcatLayer.h" + +using namespace mkldnn; // NOLINT +typedef memory::format format; + +namespace paddle { + +REGISTER_LAYER(mkldnn_concat, MKLDNNConcatLayer); + +bool MKLDNNConcatLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!MKLDNNLayer::init(layerMap, parameterMap)) { + return false; + } + CHECK_GT(inputLayers_.size(), 1UL); + CHECK(!biasParameter_); + return true; +} + +void MKLDNNConcatLayer::reshape( + int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) { + reshapeInput(bs, ih, iw); + ic = inputLayers_[0]->getSize() / ih / iw; + CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize()); + CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw); + CHECK_GT(inputLayers_.size(), 1UL); + channels_.resize(inputLayers_.size()); + channels_[0] = ic; + // need change the output channel, so use oc_ instead + // TODO(TJ): change API, use &oc + oc_ = ic; + for (size_t i = 1; i < inputLayers_.size(); i++) { + int batchsize, height, witdh; + reshapeInput(batchsize, height, witdh, i); + CHECK_EQ(bs, batchsize); + CHECK_EQ(ih, height); + CHECK_EQ(iw, witdh); + + channels_[i] = inputLayers_[i]->getSize() / height / witdh; + CHECK_EQ((size_t)channels_[i] * height * witdh, inputLayers_[i]->getSize()); + oc_ += channels_[i]; + } + oh = ih; + ow = iw; + reshapeOutput(oh, ow); + resizeOutput(bs, oc_ * oh * ow); +} + +void MKLDNNConcatLayer::resetFwd(std::vector& pipeline, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) { + resetFwdBuffers(inVals_, out); + in = inVals_[0]; + + std::shared_ptr fwdPD; + resetFwdPD(fwdPD, inVals_, out); + + resetFwdPipeline(pipeline, fwdPD, inVals_, out); +} + +void MKLDNNConcatLayer::resetBwd(std::vector& pipeline, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) { + resetBwdBuffers(inGrads_, out); + in = inGrads_[0]; + + resetBwdPipeline(pipeline, bwds_, inGrads_, out); +} + +void MKLDNNConcatLayer::resetFwdBuffers(std::vector& inputs, + MKLDNNMatrixPtr& out) { + inputs.resize(inputLayers_.size()); + bool has8c = false, has16c = false, hasnc = false; + for (size_t i = 0; i < inputs.size(); i++) { + // resetInValue will use ic_ so temporary change as current input's channel + // TODO(TJ): change ic_ as vector then can remove channels_ + ic_ = channels_[i]; + resetInValue(inputs[i], nullptr, i); + CHECK(inputs[i]); + auto dm = inputs[i]->getDims(); + // inputs format can be different, but ndims must equal + CHECK(i == 0 || dm.size() == inputs[0]->getDims().size()); + CHECK_EQ(bs_, dm[0]); + CHECK_EQ(channels_[i], dm[1]); + if (dm.size() > 2) { + CHECK_EQ(ih_, dm[2]); + CHECK_EQ(iw_, dm[3]); + } + if (inputs[i]->getFormat() == format::nc) { + hasnc = true; + } + if (inputs[i]->getFormat() == format::nChw8c) { + has8c = true; + } + if (inputs[i]->getFormat() == format::nChw16c) { + has16c = true; + } + } + // change back, ic_ always save the input 0 size + ic_ = channels_[0]; + + format outFmt; + if (has16c && oc_ % 16 == 0) { + outFmt = format::nChw16c; + } else if (has8c && oc_ % 8 == 0) { + outFmt = format::nChw8c; + } else if (hasnc) { + CHECK(oh_ == 1 && ow_ == 1); + outFmt = format::nc; + } else { + outFmt = format::nchw; + } + memory::dims outDims = + hasnc ? memory::dims{bs_, oc_} : memory::dims{bs_, oc_, oh_, ow_}; + auto outPD = MKLDNNMatrix::createPrimitiveDesc(outDims, outFmt, engine_); + resetOutValue(out, outPD); +} + +void MKLDNNConcatLayer::resetFwdPD(std::shared_ptr& pd, + std::vector& inputs, + MKLDNNMatrixPtr out) { + std::vector srcPDs; + for (size_t i = 0; i < inputs.size(); i++) { + srcPDs.push_back(inputs[i]->getPrimitiveDesc()); + } + CHECK(out); + pd.reset(new concat::primitive_desc(out->getMemoryDesc(), axis_, srcPDs)); + CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc()); +} + +void MKLDNNConcatLayer::resetFwdPipeline( + std::vector& pipeline, + std::shared_ptr& pd, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + std::vector srcs; + for (size_t i = 0; i < inputs.size(); i++) { + srcs.push_back(*(inputs[i])); + } + fwd_.reset(new concat(*pd, srcs, *out)); + pipeline.push_back(*fwd_); +} + +void MKLDNNConcatLayer::resetBwdBuffers(std::vector& inputs, + MKLDNNMatrixPtr& out) { + CHECK(outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + CHECK(out); + + inputs.resize(inputLayers_.size()); + for (size_t i = 0; i < inputs.size(); i++) { + CHECK(inVals_[i]); + // resetInGrad will use inVal_ + // TODO(TJ): change move inVals_ to MKLDNNLayer ans remove inVal_ + inVal_ = inVals_[i]; + resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i); + CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc()); + } + // change back, inVal_ always save the input 0 + inVal_ = inVals_[0]; +} + +void MKLDNNConcatLayer::resetBwdPipeline( + std::vector& pipeline, + std::vector>& prims, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + // reset the backward primitives + memory::dims offsets = {0, 0, 0, 0}; + prims.resize(inputs.size()); + CHECK_EQ(inputs.size(), channels_.size()); + for (size_t i = 0; i < inputs.size(); i++) { + auto viewPD = view::primitive_desc( + out->getPrimitiveDesc(), inputs[i]->getDims(), offsets); + auto bwdPD = reorder::primitive_desc(viewPD.dst_primitive_desc(), + inputs[i]->getPrimitiveDesc()); + prims[i].reset(new reorder(bwdPD, *out, *(inputs[i]))); + offsets[axis_] += channels_[i]; + // push to pipeline + pipeline.push_back(*prims[i]); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.h b/paddle/gserver/layers/MKLDNNConcatLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..d5749d327e4259b81541a234f48a4538ab035fe4 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNConcatLayer.h @@ -0,0 +1,129 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "MKLDNNLayer.h" +#include "mkldnn.hpp" + +namespace paddle { + +/** + * @brief A subclass of MKLDNNLayer Concatenate layer. + * + * The config file api is mkldnn_concat + */ +class MKLDNNConcatLayer : public MKLDNNLayer { +protected: + std::vector inVals_; + std::vector inGrads_; + std::vector> bwds_; + // input channel numbers + std::vector channels_; + + // concat_dimension in MKLDNN + // if axis_ == 0, concat batchsize + // if axis_ == 1, concat channel (default) + int axis_; + +public: + explicit MKLDNNConcatLayer(const LayerConfig& config) + : MKLDNNLayer(config), axis_(1) {} + + ~MKLDNNConcatLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void reshape( + int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override; + + void resetFwd(std::vector& pipeline, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) override; + + void resetBwd(std::vector& pipeline, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) override; + + void printSizeInfo() override { + CHECK_EQ(channels_.size(), inputLayers_.size()); + for (size_t i = 0; i < channels_.size(); ++i) { + VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName() + << ": " << bs_ << ", " << channels_[i] << ", " << ih_ + << ", " << iw_; + } + VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_ + << ", " << ow_; + } + + void printValueFormat() override { + for (size_t i = 0; i < inVals_.size(); ++i) { + VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName() + << ": " << inVals_[i]->getFormat() << " >>>"; + } + if (outVal_) { + VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "; + } + if (extOutVal_) { + VLOG(MKLDNN_FMTS) << extOutVal_->getFormat(); + } + } + + void printGradFormat() override { + if (extOutGrad_) { + VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat(); + } + if (outGrad_) { + VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "; + } + for (size_t i = 0; i < inGrads_.size(); ++i) { + VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName() + << ": " << inGrads_[i]->getFormat() << "<<<"; + } + } + +protected: + /** + * Forward functions: reset buffers(inputs, output, bias), + * reset primitive descriptor, + * reset pipeline. + */ + void resetFwdBuffers(std::vector& inputs, + MKLDNNMatrixPtr& out); + void resetFwdPD(std::shared_ptr& pd, + std::vector& inputs, + MKLDNNMatrixPtr out); + void resetFwdPipeline(std::vector& pipeline, + std::shared_ptr& pd, + std::vector& inputs, + MKLDNNMatrixPtr& out); + + /** + * Backward functions: reset buffers(inputs, output, bias) + * reset primitives and pipeline + */ + void resetBwdBuffers(std::vector& inputs, + MKLDNNMatrixPtr& out); + void resetBwdPipeline(std::vector& pipeline, + std::vector>& prims, + std::vector& inputs, + MKLDNNMatrixPtr& out); +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp index e75ac5ba4647a8267b7bc189893bd7adb5c3053f..cf42da0735282d667d6b87061c8c59bf2f96e0be 100644 --- a/paddle/gserver/layers/MKLDNNLayer.cpp +++ b/paddle/gserver/layers/MKLDNNLayer.cpp @@ -21,8 +21,8 @@ namespace paddle { bool MKLDNNLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { - CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn." - << "Please set WITH_MKLDNN=ON " + CHECK(FLAGS_use_mkldnn) << "MKLDNNLayers only support use_mkldnn." + << "Please set WITH_MKL=ON " << "and set use_mkldnn=True"; CHECK(!useGpu_) << "Do not support GPU yet"; @@ -138,8 +138,11 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) { } } -void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) { - const Argument& input = inputLayers_[0]->getOutput(); +void MKLDNNLayer::reshapeInput(int& batchsize, + int& height, + int& width, + size_t inputIdx) { + const Argument& input = inputLayers_[inputIdx]->getOutput(); batchsize = input.getBatchSize(); int h = input.getFrameHeight(); int w = input.getFrameWidth(); diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index 7479c34c92b5231b2521493bc631474d4efd4224..4c42df1bee75fa7b28c2001c30797cc0df7c5554 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -178,7 +178,10 @@ protected: /** * reshape the input image sizes and input batchsize */ - void reshapeInput(int& batchsize, int& height, int& width); + void reshapeInput(int& batchsize, + int& height, + int& width, + size_t inputIdx = 0); /** * reshape output image sizes diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 4bea348f637f39444e8aad89278e6366ecd73b1d..c295ea19c9ccb3d05c509a41925d2c36efdba8ef 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -29,7 +29,7 @@ gserver_test(test_KmaxSeqScore) gserver_test(test_Expand) gserver_test(test_MaxPoolingWithMaskOutput) -########## test_Mkldnn layers and activations ########## +########## test_MKLDNN layers and activations ########## if(WITH_MKLDNN) add_unittest_without_exec(test_MKLDNN test_MKLDNN.cpp @@ -62,17 +62,6 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE) endif() if(NOT MOBILE_INFERENCE) -################### test_ProtoDataProvider ############ - add_unittest_without_exec(test_ProtoDataProvider - test_ProtoDataProvider.cpp) - - # test_ProtoDataProvider will mkdir as same name, - # so if WORKING_DIRECTORY is default directory, then - # mkdir will get error. - add_test(NAME test_ProtoDataProvider - COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) - ################## test_Evaluator ####################### add_unittest(test_Evaluator test_Evaluator.cpp) @@ -110,3 +99,24 @@ add_test(NAME test_PyDataProvider2 COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2 WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle ) + +################# test_CompareSparse ################## +add_unittest_without_exec(test_CompareSparse + test_CompareSparse.cpp) +if(NOT ON_TRAVIS) + add_test(NAME test_CompareSparse + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests + ./.set_port.sh -p port -n 6 + ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) +endif() + +################ test_CompareTwoNets ###################### +add_unittest_without_exec(test_CompareTwoNets + test_CompareTwoNets.cpp) +add_test(NAME test_CompareTwoNets + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests + ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h index ca55a45bc77b4e171619ab788d7c7dfeefcd036a..9d61533c0b6f20c41130d7b7c15ad93392b2d24c 100644 --- a/paddle/gserver/tests/MKLDNNTester.h +++ b/paddle/gserver/tests/MKLDNNTester.h @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { /** - * @brief test the functionality of Mkldnnlayers + * @brief test the functionality of MKLDNNlayers and MKLDNNActivations * refer to paddle original function */ class MKLDNNTester { diff --git a/paddle/gserver/tests/proto_files.txt b/paddle/gserver/tests/proto_files.txt deleted file mode 100644 index 691b38c7940bd21360eb00384e060554aa4b3e22..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/proto_files.txt +++ /dev/null @@ -1,2 +0,0 @@ -./test_ProtoDataProvider/data1.bin -./test_ProtoDataProvider/data2.bin diff --git a/paddle/gserver/tests/proto_files_compressed.txt b/paddle/gserver/tests/proto_files_compressed.txt deleted file mode 100644 index 7413c81e185d02e0d03aefa06480b9722357c5eb..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/proto_files_compressed.txt +++ /dev/null @@ -1,2 +0,0 @@ -./test_ProtoDataProvider/data1.bin.gz -./test_ProtoDataProvider/data2.bin.gz diff --git a/paddle/gserver/tests/sequence_lstm.conf b/paddle/gserver/tests/sequence_lstm.conf new file mode 100644 index 0000000000000000000000000000000000000000..f49a827f22edce056eaf9903e99b732cab7f3784 --- /dev/null +++ b/paddle/gserver/tests/sequence_lstm.conf @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file": dict_file}) + +settings(batch_size=5) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 256 +label_dim = 3 +sparse_update = get_config_arg("sparse_update", bool, False) + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer( + input=data, + size=word_dim, + param_attr=ParamAttr(sparse_update=sparse_update)) + +with mixed_layer(size=hidden_dim * 4) as lstm_input: + lstm_input += full_matrix_projection(input=emb) + +lstm = lstmemory( + input=lstm_input, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation()) + +lstm_last = last_seq(input=lstm) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=lstm_last) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_recurrent.py b/paddle/gserver/tests/sequence_recurrent.py new file mode 100644 index 0000000000000000000000000000000000000000..4895df186bfecc5cb5263676a9cd5bac5039d565 --- /dev/null +++ b/paddle/gserver/tests/sequence_recurrent.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file": dict_file}) + +settings(batch_size=5) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 128 +label_dim = 3 + +# This config is designed to be equivalent with sequence_recurrent_group.py + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer( + input=data, size=word_dim, param_attr=ParamAttr(name="emb")) + +recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation()) + +recurrent_last = last_seq(input=recurrent) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=recurrent_last) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_recurrent_group.py b/paddle/gserver/tests/sequence_recurrent_group.py new file mode 100644 index 0000000000000000000000000000000000000000..a1d54542e3bc4e89f70d31d5e89c0f44953c9f90 --- /dev/null +++ b/paddle/gserver/tests/sequence_recurrent_group.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file": dict_file}) + +settings(batch_size=5) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 128 +label_dim = 3 + +# This config is designed to be equivalent with sequence_recurrent.py + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer( + input=data, size=word_dim, param_attr=ParamAttr(name="emb")) + + +def step(y): + mem = memory(name="rnn_state", size=hidden_dim) + with mixed_layer( + name="rnn_state", + size=hidden_dim, + bias_attr=False, + act=SoftmaxActivation()) as out: + out += identity_projection(input=y) + out += full_matrix_projection( + input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__")) + return out + + +recurrent = recurrent_group(name="rnn", step=step, input=emb) + +recurrent_last = last_seq(input=recurrent) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=recurrent_last) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/gserver/tests/test_CompareSparse.cpp similarity index 98% rename from paddle/trainer/tests/test_CompareSparse.cpp rename to paddle/gserver/tests/test_CompareSparse.cpp index 5f1834bd730375fc10762fc19788d0c693f8e752..c6e07650fc4805a25baf38b9059f6c996d00cafc 100644 --- a/paddle/trainer/tests/test_CompareSparse.cpp +++ b/paddle/gserver/tests/test_CompareSparse.cpp @@ -22,8 +22,7 @@ limitations under the License. */ using namespace paddle; // NOLINT using namespace std; // NOLINT -static const string& configFile1 = - "trainer/tests/sample_trainer_config_compare_sparse.conf"; +static const string& configFile1 = "gserver/tests/sequence_lstm.conf"; DECLARE_bool(use_gpu); DECLARE_string(config); diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/gserver/tests/test_CompareTwoNets.cpp similarity index 95% rename from paddle/trainer/tests/test_CompareTwoNets.cpp rename to paddle/gserver/tests/test_CompareTwoNets.cpp index 94f65e545d116c802fb4877dc14f07aaaf83a4fb..801d9607565910b1f7f68a9c4532de5877e44f30 100644 --- a/paddle/trainer/tests/test_CompareTwoNets.cpp +++ b/paddle/gserver/tests/test_CompareTwoNets.cpp @@ -30,8 +30,6 @@ DECLARE_bool(use_gpu); DECLARE_string(config); DECLARE_string(nics); -DEFINE_string(config_file_a, "", "config of one network to compare"); -DEFINE_string(config_file_b, "", "config of another network to compare"); DEFINE_bool(need_high_accuracy, false, "whether need to run in double accuracy"); @@ -42,6 +40,10 @@ DEFINE_double( DECLARE_bool(thread_local_rand_use_global_seed); DECLARE_int32(seed); +static const string& config_file_a = "gserver/tests/sequence_recurrent.py"; +static const string& config_file_b = + "gserver/tests/sequence_recurrent_group.py"; + struct ComData { vector outArgs; vector parameters; @@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) { DataBatch dataBatch; int32_t batchSize = trainer.getConfig().opt_config().batch_size(); + trainer.getDataProvider()->reset(); trainer.getDataProvider()->setSkipShuffle(); trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch); @@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { TEST(Trainer, create) { ComData dataA; - calcGradient(dataA, FLAGS_config_file_a); + calcGradient(dataA, config_file_a); LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n"; ComData dataB; - calcGradient(dataB, FLAGS_config_file_b); + calcGradient(dataB, config_file_b); LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n"; compareGradient(dataA, dataB); diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 18f8d602b223cb4f2f620417862264776e752620..cacf10692942f5eca2f6c498183f4acc00768460 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1082,6 +1082,21 @@ TEST(Layer, InterpolationLayer) { } } +TEST(Layer, DotProdLayer) { + TestConfig config; + config.layerConfig.set_type("dot_prod"); + config.layerConfig.set_size(1); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "dot_prod", 10, false, useGpu); + } +} + TEST(Layer, OuterProdLayer) { TestConfig config; config.layerConfig.set_type("out_prod"); diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp index a859e34c8996d81f14bf1edcb6e23d5a4f687e6b..42644e9601a82ea81c417adc6441edeb036998e2 100644 --- a/paddle/gserver/tests/test_MKLDNN.cpp +++ b/paddle/gserver/tests/test_MKLDNN.cpp @@ -313,6 +313,47 @@ TEST(MKLDNNLayer, AddtoLayer) { testAddtoLayer({4, 12, 1, 1}, 3); } +static void getMKLDNNConcatConfig(TestConfig& cfg, + const std::vector& inputs) { + CHECK_GE(inputs.size(), 2) << "at least two inputs"; + int oc = inputs[0].ic; + for (size_t i = 1; i < inputs.size(); ++i) { + CHECK_EQ(inputs[i].bs, inputs[0].bs); + CHECK_EQ(inputs[i].ih, inputs[0].ih); + CHECK_EQ(inputs[i].iw, inputs[0].iw); + oc += inputs[i].ic; + } + cfg.biasSize = 0; + cfg.layerConfig.set_type("mkldnn_concat"); + cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw); + cfg.layerConfig.set_active_type("relu"); + for (size_t i = 0; i < inputs.size(); ++i) { + std::stringstream ss; + ss << "layer_" << i; + cfg.inputDefs.push_back( + {INPUT_DATA, + ss.str(), + (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw, + 0}); + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + ImageConfig* img_conf = input->mutable_image_conf(); + img_conf->set_channels(inputs[i].ic); + img_conf->set_img_size_y(inputs[i].ih); + img_conf->set_img_size(inputs[i].iw); + } +} + +void testConcatLayer(const std::vector& inputs) { + TestConfig dnnConfig; + getMKLDNNConcatConfig(dnnConfig, inputs); + RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0]) +} + +TEST(MKLDNNLayer, ConcatLayer) { + testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}}); + testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}}); +} + void testActivation(std::string actType, const testImageDesc& pm) { // TODO(TJ): remove me when paddle support elu activation if (actType == "mkldnn_elu") { diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp deleted file mode 100644 index af6472619d1840e82787974d265d601b4a406c09..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_ProtoDataProvider.cpp +++ /dev/null @@ -1,732 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include - -#include "paddle/gserver/dataproviders/ProtoDataProvider.h" -#include "paddle/utils/Util.h" - -#include "paddle/testing/TestUtil.h" - -using namespace std; // NOLINT - -std::vector protoFiles{ - "./test_ProtoDataProvider/data1.bin", "./test_ProtoDataProvider/data2.bin", -}; -std::vector protoFilesCompressed{ - "./test_ProtoDataProvider/data1.bin.gz", - "./test_ProtoDataProvider/data2.bin.gz", -}; - -const char* kTestDir = "./test_ProtoDataProvider"; -const char kProtoFileList[] = "gserver/tests/proto_files.txt"; -const char kProtoFileListCompressed[] = - "gserver/tests/proto_files_compressed.txt"; -const int kSpraseMatrixDim = 1024; - -using namespace paddle; // NOLINT - -void prepareData(DataBatch* batch, - const int* numPerSlotType, - bool iid, - bool useGpu) { - batch->clear(); - int64_t size = uniformRandom(100) + 10; - batch->setSize(size); - - ICpuGpuVectorPtr sequenceStartPositions; - ICpuGpuVectorPtr subSequenceStartPositions; - if (!iid) { - int numSeqs = uniformRandom(10) + 1; - sequenceStartPositions = - ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false); - int* buf = sequenceStartPositions->getMutableData(false); - subSequenceStartPositions = - ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false); - int* subBuf = subSequenceStartPositions->getMutableData(false); - int64_t pos = 0; - int maxLen = 2 * size / numSeqs; - for (int i = 0; i < numSeqs; ++i) { - int len = - uniformRandom(min(maxLen, size - pos - numSeqs + i)) + 1; - buf[i] = pos; - subBuf[i] = pos; - pos += len; - VLOG(1) << " len=" << len; - } - buf[numSeqs] = size; - subBuf[numSeqs] = size; - } - - vector& arguments = batch->getStreams(); - for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_DENSE]; ++i) { - int64_t dim = rand() % 10 + 4; // NOLINT rand_r - MatrixPtr mat = Matrix::create(size, dim, /* trans= */ false, false); - mat->randomizeUniform(); - Argument arg; - arg.value = mat; - arg.sequenceStartPositions = sequenceStartPositions; - arguments.push_back(arg); - } - for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE]; ++i) { - MatrixPtr mat = - makeRandomSparseMatrix(size, kSpraseMatrixDim, false, useGpu); - Argument arg; - arg.value = mat; - arg.sequenceStartPositions = sequenceStartPositions; - arg.subSequenceStartPositions = subSequenceStartPositions; - arguments.push_back(arg); - } - for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE]; ++i) { - MatrixPtr mat = - makeRandomSparseMatrix(size, kSpraseMatrixDim, true, useGpu); - Argument arg; - arg.value = mat; - arg.sequenceStartPositions = sequenceStartPositions; - arguments.push_back(arg); - } - for (int i = 0; i < numPerSlotType[SlotDef::STRING]; ++i) { - int64_t dim = rand() % 10 + 4; // NOLINT rand_r - SVectorPtr vec = std::make_shared>(); - for (int j = 0; j < size; ++j) { - vec->push_back(randStr(dim)); - } - Argument arg; - arg.strs = vec; - arg.sequenceStartPositions = sequenceStartPositions; - arguments.push_back(arg); - } - for (int i = 0; i < numPerSlotType[SlotDef::INDEX]; ++i) { - int64_t dim = rand() % 10 + 4; // NOLINT rand_r - IVectorPtr vec = IVector::create(size, /* useGpu= */ false); - int* buf = vec->getData(); - for (int j = 0; j < size; ++j) { - buf[j] = uniformRandom(dim); - } - Argument arg; - arg.ids = vec; - arg.sequenceStartPositions = sequenceStartPositions; - arguments.push_back(arg); - } -} - -inline int getSlotDim(const Argument& arg) { - if (arg.value) { - return arg.value->getWidth(); - } else if (arg.ids) { - return arg.ids->getMax() + 1; - } else if (arg.strs) { - return 1; - } - LOG(FATAL) << "Invalid argument"; - return 0; -} - -inline SlotDef::SlotType getSlotType(const Argument& arg) { - if (arg.value) { - auto& m = *arg.value; - auto& type = typeid(m); - if (type == typeid(CpuMatrix) || type == typeid(GpuMatrix)) { - return SlotDef::VECTOR_DENSE; - } - if (type == typeid(CpuSparseMatrix)) { - auto valueType = - std::dynamic_pointer_cast(arg.value)->getValueType(); - if (NO_VALUE == valueType) { - return SlotDef::VECTOR_SPARSE_NON_VALUE; - } else { - return SlotDef::VECTOR_SPARSE_VALUE; - } - } - if (type == typeid(GpuSparseMatrix)) { - auto valueType = - std::dynamic_pointer_cast(arg.value)->getValueType(); - if (NO_VALUE == valueType) { - return SlotDef::VECTOR_SPARSE_NON_VALUE; - } else { - return SlotDef::VECTOR_SPARSE_VALUE; - } - } - - LOG(FATAL) << "Unknown matrix type"; - } - if (arg.ids) return SlotDef::INDEX; - if (arg.strs) return SlotDef::STRING; - LOG(FATAL) << "Invalid argument"; - return SlotDef::VECTOR_DENSE; -} - -void getColRow(const Argument& arg, - int64_t pos, - bool useGpu, - int* colNum, - const int** rowCols, - const real** rowValues) { - SlotDef::SlotType type = getSlotType(arg); - GpuSparseMatrixPtr matGpu; - CpuSparseMatrixPtr matCpu; - if (useGpu) { - matGpu = dynamic_pointer_cast(arg.value); - ASSERT_TRUE(matGpu != NULL); - } else { - matCpu = dynamic_pointer_cast(arg.value); - ASSERT_TRUE(matCpu != NULL); - } - *colNum = useGpu ? matGpu->getColNum(pos) : matCpu->getColNum(pos); - *rowCols = useGpu ? matGpu->getRowCols(pos) : matCpu->getRowCols(pos); - if (type == SlotDef::VECTOR_SPARSE_VALUE) { - *rowValues = useGpu ? matGpu->getRowValues(pos) : matCpu->getRowValues(pos); - } else { - *rowValues = NULL; - } -} - -void makeSample(const vector& arguments, - int64_t pos, - bool isBeginning, - DataSample* sample, - bool useGpu) { - sample->set_is_beginning(isBeginning); - int slotid = 0; - for (auto& arg : arguments) { - SlotDef::SlotType type = getSlotType(arg); - int64_t dim = getSlotDim(arg); - switch (type) { - case SlotDef::VECTOR_DENSE: { - VectorSlot* vecSlot = sample->add_vector_slots(); - auto values = vecSlot->mutable_values(); - values->Reserve(dim); - for (int i = 0; i < dim; ++i) { - values->AddAlreadyReserved( - static_cast(arg.value->getElement(pos, i))); - } - break; - } - case SlotDef::INDEX: { - sample->add_id_slots(arg.ids->get(pos)); - break; - } - case SlotDef::VECTOR_SPARSE_NON_VALUE: { - VectorSlot* vecSlot = sample->add_vector_slots(); - auto ids = vecSlot->mutable_ids(); - int colNum; - const int* rowCols; - const real* rowValues; // nullptr - getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues); - ids->Reserve(colNum); - for (int i = 0; i < colNum; ++i) { - ids->AddAlreadyReserved(rowCols[i]); - } - SubseqSlot* subseqSlot = sample->add_subseq_slots(); // subseq - subseqSlot->set_slot_id(slotid); - auto lens = subseqSlot->mutable_lens(); - lens->Add(colNum); - break; - } - case SlotDef::VECTOR_SPARSE_VALUE: { - VectorSlot* vecSlot = sample->add_vector_slots(); - auto values = vecSlot->mutable_values(); - auto ids = vecSlot->mutable_ids(); - int colNum; - const int* rowCols; - const real* rowValues; - getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues); - ids->Reserve(colNum); - values->Reserve(colNum); - for (int i = 0; i < colNum; ++i) { - ids->AddAlreadyReserved(rowCols[i]); - values->AddAlreadyReserved(rowValues[i]); - } - break; - } - case SlotDef::VAR_MDIM_DENSE: - case SlotDef::VAR_MDIM_INDEX: { - LOG(FATAL) << "Not implemented"; - break; - } - case SlotDef::STRING: { - VectorSlot* vecSlot = sample->add_vector_slots(); - vecSlot->add_strs((*arg.strs)[pos]); - break; - } - } - slotid++; - } -} - -void writeData(const DataBatch& batch, bool useGpu, bool dataCompression) { - DataHeader header; - const vector& arguments = batch.getStreams(); - for (auto& argument : arguments) { - SlotDef* slotDef = header.add_slot_defs(); - slotDef->set_type(getSlotType(argument)); - slotDef->set_dim(getSlotDim(argument)); - } - VLOG(1) << "header=" << header.DebugString(); - - int64_t totalSeqs = batch.getNumSequences(); - int64_t seq = 0; - ICpuGpuVectorPtr sequenceStartPositions = arguments[0].sequenceStartPositions; - int64_t numWritten = 0; - vector curProtoFiles = - dataCompression ? protoFilesCompressed : protoFiles; - for (size_t i = 0; i < curProtoFiles.size(); ++i) { - int64_t numSeqs = totalSeqs * (i + 1) / curProtoFiles.size() - - totalSeqs * i / curProtoFiles.size(); - ofstream os(curProtoFiles[i]); - CHECK(os) << "Fail to open " << curProtoFiles[i]; - unique_ptr writer(new ProtoWriter(&os, dataCompression)); - CHECK(writer->write(header)); - for (int j = 0; j < numSeqs; ++j, ++seq) { - int64_t begin = seq; - int64_t end = seq + 1; - if (sequenceStartPositions) { - begin = sequenceStartPositions->getElement(seq); - end = sequenceStartPositions->getElement(seq + 1); - } - for (int pos = begin; pos < end; ++pos) { - DataSample sample; - makeSample(arguments, pos, pos == begin, &sample, useGpu); - CHECK(writer->write(sample)); - ++numWritten; - } - } - - writer.reset(nullptr); - os.close(); - } - CHECK_EQ(arguments[0].getBatchSize(), numWritten); -} - -// check that the sample at pos1 in args1 is same as the sample at pos2 in args2 -void checkSample(const vector& args1, - int64_t pos1, - const vector& args2, - int64_t pos2, - bool useGpu) { - EXPECT_EQ(args1.size(), args2.size()); - VLOG(1) << " pos1=" << pos1 << " pos2=" << pos2; - - for (size_t i = 0; i < args1.size(); ++i) { - auto type = getSlotType(args1[i]); - int dim = getSlotDim(args1[i]); - EXPECT_EQ(type, getSlotType(args2[i])); - if (type == SlotDef::INDEX) { - EXPECT_GE(dim, getSlotDim(args2[i])); - } else { - EXPECT_EQ(dim, getSlotDim(args2[i])); - } - switch (type) { - case SlotDef::VECTOR_DENSE: { - for (int j = 0; j < dim; ++j) { - EXPECT_EQ(static_cast(args1[i].value->getElement(pos1, j)), - static_cast(args2[i].value->getElement(pos2, j))); - } - break; - } - case SlotDef::INDEX: { - EXPECT_EQ(args1[i].ids->get(pos1), args2[i].ids->get(pos2)); - break; - } - case SlotDef::VECTOR_SPARSE_NON_VALUE: - case SlotDef::VECTOR_SPARSE_VALUE: { - int colNum1, colNum2; - const int *rowCols1, *rowCols2; - const real *rowValues1, *rowValues2; - getColRow(args1[i], pos1, useGpu, &colNum1, &rowCols1, &rowValues1); - getColRow(args2[i], pos2, useGpu, &colNum2, &rowCols2, &rowValues2); - EXPECT_EQ(colNum1, colNum2); - for (int j = 0; j < colNum1; ++j) { - EXPECT_EQ(rowCols1[j], rowCols2[j]); - if (type == SlotDef::VECTOR_SPARSE_VALUE) { - EXPECT_EQ(rowValues1[j], rowValues2[j]); - } - } - break; - } - case SlotDef::VAR_MDIM_DENSE: - case SlotDef::VAR_MDIM_INDEX: { - LOG(FATAL) << "Not implemented"; - break; - } - case SlotDef::STRING: { - EXPECT_EQ((*args1[i].strs)[pos1], (*args2[i].strs)[pos2]); - break; - } - } - } -} - -void testProtoDataProvider(int* numPerSlotType, - bool iid, - bool async, - bool useGpu, - bool dataCompression, - int numConstantSlots = 0) { - mkDir(kTestDir); - DataBatch data; - - prepareData(&data, numPerSlotType, iid, useGpu); - writeData(data, useGpu, dataCompression); - - DataConfig config; - config.set_type("proto"); - config.set_files(dataCompression ? kProtoFileListCompressed : kProtoFileList); - config.set_async_load_data(async); - - for (int i = 0; i < numConstantSlots; ++i) { - config.add_constant_slots(i + 11); - MatrixPtr w = Matrix::create(data.getSize(), - 1, - /* trans= */ false, - /* useGpu= */ false); - w->assign(config.constant_slots(i)); - data.appendData(w); - } - - unique_ptr dataProvider(DataProvider::create(config, useGpu)); - dataProvider->setSkipShuffle(); - - EXPECT_EQ(data.getSize(), dataProvider->getSize()); - - int64_t batchSize = 10; - DataBatch batch; - - size_t seq1 = 0; - vector& args1 = data.getStreams(); - ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions; - - dataProvider->reset(); - - while (dataProvider->getNextBatch(batchSize, &batch) > 0) { - CHECK_EQ(data.getNumStreams(), batch.getNumStreams()); - vector& args2 = batch.getStreams(); - ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions; - for (auto& arg : args2) { - EXPECT_EQ(iid, !arg.sequenceStartPositions); - } - size_t numSeqs = batch.getNumSequences(); - VLOG(1) << "numSeqs=" << numSeqs; - for (size_t seq2 = 0; seq2 < numSeqs; ++seq1, ++seq2) { - int64_t begin1 = seq1; - int64_t end1 = seq1 + 1; - if (sequenceStartPositions1) { - begin1 = sequenceStartPositions1->getElement(seq1); - end1 = sequenceStartPositions1->getElement(seq1 + 1); - EXPECT_LT(seq1, sequenceStartPositions1->getSize() - 1); - } - - int64_t begin2 = seq2; - int64_t end2 = seq2 + 1; - if (sequenceStartPositions2) { - begin2 = sequenceStartPositions2->getElement(seq2); - end2 = sequenceStartPositions2->getElement(seq2 + 1); - } - VLOG(1) << " begin1=" << begin1 << " end1=" << end1 - << " begin2=" << begin2 << " end2=" << end2; - EXPECT_EQ(end1 - begin1, end2 - begin2); - for (int i = 0; i < end1 - begin1; ++i) { - checkSample(args1, begin1 + i, args2, begin2 + i, useGpu); - } - } - } - - EXPECT_EQ(seq1, (size_t)data.getNumSequences()); - rmDir(kTestDir); -} - -TEST(ProtoDataProvider, test) { - int numSlotsArray[] = {0, 3}; - int numTwoArray[] = {0, 1}; - int numSlotsArraySize = sizeof(numSlotsArray) / sizeof(numSlotsArray[0]); - const int numSlot = 5; - int combination[numSlot] = {0}; - int k = numSlot - 1; - while (k >= 0) { - int numDenseVecSlots = numSlotsArray[combination[0]]; - int numSparseNonValueVecSlots = numSlotsArray[combination[1]]; - int numSparseValueVectorSlots = numSlotsArray[combination[2]]; - int numStrSlots = numSlotsArray[combination[3]]; - int numIdSlots = numSlotsArray[combination[4]]; - // while loop : traverse all cases - k = numSlot - 1; - while (k >= 0) { - if (combination[k] < (numSlotsArraySize - 1)) { - ++combination[k]; - break; - } else { - combination[k] = 0; - --k; - } - } - if (numDenseVecSlots + numSparseNonValueVecSlots + - numSparseValueVectorSlots + numStrSlots + numIdSlots < - 1) - continue; - for (int iid : numTwoArray) { - for (int async : numTwoArray) { - for (int useGpu : numTwoArray) { - for (int dataCompression : numTwoArray) { - if (async && useGpu) { - // Currently in async mode, useGpu is not supported - continue; - } -#ifndef PADDLE_WITH_CUDA - if (useGpu) { - continue; - } -#endif - LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots - << " numSparseNonValueVecSlots=" - << numSparseNonValueVecSlots - << " numSparseValueVectorSlots=" - << numSparseValueVectorSlots - << " numStrSlots=" << numStrSlots - << " numIdSlots=" << numIdSlots << " iid=" << iid - << " async=" << async << " useGpu=" << useGpu - << " dataCompression=" << dataCompression; - int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0}; - numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots; - numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] = - numSparseNonValueVecSlots; - numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] = - numSparseValueVectorSlots; - numPerSlotType[SlotDef::INDEX] = numIdSlots; - numPerSlotType[SlotDef::STRING] = numStrSlots; - testProtoDataProvider( - numPerSlotType, iid, async, useGpu, dataCompression); - } // end for (int dataCompression : numTwoArray) - } // end for (int useGpu : numTwoArray) - } // end for (int async : numTwoArray) - } // end for (int iid : numTwoArray) - } // end for (while, traverse all slots) -} - -TEST(ProtoDataProvider, constant_slots) { - int numSlotsArray[] = {0, 3}; - int numTwoArray[] = {0, 1}; - for (int numDenseVecSlots : numSlotsArray) { - for (int numSparseNonValueVecSlots : numSlotsArray) { - if (numDenseVecSlots + numSparseNonValueVecSlots < 1) continue; - for (int numConstantSlots : {1, 2}) { - for (int useGpu : numTwoArray) { - for (int dataCompression : numTwoArray) { -#ifndef PADDLE_WITH_CUDA - if (useGpu) { - continue; - } -#endif - LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots - << " numSparseNonValueVecSlots=" - << numSparseNonValueVecSlots - << " numConstantSlogs=" << numConstantSlots - << " useGpu=" << useGpu - << " dataCompression=" << dataCompression; - int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0}; - numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots; - numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] = - numSparseNonValueVecSlots; - numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] = 1; - numPerSlotType[SlotDef::INDEX] = 1; - testProtoDataProvider(numPerSlotType, - /* iid= */ true, - /* async= */ false, - useGpu, - dataCompression, - numConstantSlots); - } // end for (int dataCompression : numTwoArray) - } // end for (int useGpu : numTwoArray) - } // end for (int numConstantSlots : {1, 2}) - } // end for (int numSparseNonValueVecSlots : numSlotsArray) - } // end for (int numDenseVecSlots : numSlotsArray) -} - -void checkSampleSequence(const vector& args1, - const vector& args2, - int64_t offset, - int64_t numSeqs, - bool useGpu) { - // check slot num are equal - EXPECT_EQ(args1.size(), args2.size()); - for (size_t i = 0; i < args1.size(); i++) { - auto type = getSlotType(args1[i]); - // check for args2: sequenceStartPositions vs numSeqs - // (1) size - EXPECT_EQ(args2[i].sequenceStartPositions->getSize(), (size_t)numSeqs + 1); - // (2) content - auto checkArgContent = [&](const Argument& args, int numSeqs) { - for (int j = 0; j <= numSeqs; j++) { - int start_pos = args.sequenceStartPositions->getElement(j); - EXPECT_EQ(start_pos, j); - } - }; - switch (type) { - case SlotDef::INDEX: { - // args1: for label - checkArgContent(args2[i], numSeqs); - // check for args2: ids are equal to args1[offset] - // (1) size - EXPECT_EQ(args2[i].ids->getSize(), (size_t)numSeqs); - // (2) content - for (int j = 0; j < numSeqs; j++) { - EXPECT_EQ(args2[i].ids->get(j), args1[i].ids->get(offset + j)); - } - break; - } - case SlotDef::VECTOR_SPARSE_NON_VALUE: { - // args1: for sparse_non_value - // args2 should put sparse indexes in ids - int colNum1; - const int* rowCols1; - const real* rowValues1; // nullptr - int totalLength = 0; - for (int j = 0; j < numSeqs; j++) { - getColRow( - args1[i], offset + j, useGpu, &colNum1, &rowCols1, &rowValues1); - // (1) lengths - EXPECT_EQ(totalLength, - args2[i].sequenceStartPositions->getElement(j)); - EXPECT_EQ(totalLength, - args2[i].subSequenceStartPositions->getElement(j)); - // (2) content - for (int k = 0; k < colNum1; k++) { - EXPECT_EQ(rowCols1[k], args2[i].ids->get(totalLength + k)); - } - totalLength += colNum1; - if (colNum1 == 0) { - // special case here: we will put a "-1" into ids when column num is - // zero. see ProtoSequenceDataProvider::getNextBatchInternal. - EXPECT_EQ(-1, args2[i].ids->get(totalLength)); - totalLength++; - } - } - EXPECT_EQ(totalLength, - args2[i].sequenceStartPositions->getElement(numSeqs)); - EXPECT_EQ(totalLength, - args2[i].subSequenceStartPositions->getElement(numSeqs)); - break; - } - case SlotDef::VECTOR_DENSE: { - // args1: for dense vector - checkArgContent(args2[i], numSeqs); - // check for args2: values are equal to args1[offset] - // (1) size - EXPECT_EQ(args2[i].value->getHeight(), (size_t)numSeqs); - EXPECT_EQ(args2[i].value->getWidth(), (size_t)getSlotDim(args1[i])); - // (2) content - for (int j = 0; j < numSeqs; j++) { - for (size_t k = 0; k < args2[i].value->getWidth(); k++) { - EXPECT_EQ( - static_cast(args1[i].value->getElement(j + offset, k)), - static_cast(args2[i].value->getElement(j, k))); - } - } - break; - } - default: { EXPECT_EQ(true, false) << "should not reach here"; } - } - } -} - -void testProtoSequenceDataProvider(int* numPerSlotType, - bool async, - bool useGpu) { - mkDir(kTestDir); - DataBatch data; - - prepareData(&data, - numPerSlotType, - /* iid */ true, - useGpu); - writeData(data, useGpu, /* dataCompression */ false); - - DataConfig config; - config.set_type("proto_sequence"); - config.set_files(kProtoFileList); - config.set_async_load_data(async); - - unique_ptr dataProvider(DataProvider::create(config, useGpu)); - dataProvider->setSkipShuffle(); - - EXPECT_EQ(data.getSize(), dataProvider->getSize()); - - int64_t batchSize = 10; - DataBatch batch; - - vector& args1 = data.getStreams(); - ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions; - - dataProvider->reset(); - - size_t args1Offset = 0; - while (dataProvider->getNextBatch(batchSize, &batch) > 0) { - CHECK_EQ(data.getNumStreams(), batch.getNumStreams()); - vector& args2 = batch.getStreams(); - ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions; - for (auto& arg : args1) { - // args1 should not has sequence - EXPECT_EQ(true, !arg.sequenceStartPositions); - } - for (auto& arg : args2) { - // args2 should has sequence - EXPECT_NE(true, !arg.sequenceStartPositions); - } - size_t numSeqs = batch.getNumSequences(); - checkSampleSequence(args1, args2, args1Offset, numSeqs, useGpu); - args1Offset += numSeqs; - } - - EXPECT_EQ(args1Offset, (size_t)data.getNumSequences()); - rmDir(kTestDir); -} - -TEST(ProtoSequenceDataProvider, test) { - int numSlotsArray[] = {0, 3}; - int numTwoArray[] = {0, 1}; - for (int numSparseNonValueVecSlots : numSlotsArray) { - for (int numIdSlots : numSlotsArray) { - for (int numDenseVecSlots : numSlotsArray) { - if (numDenseVecSlots + numSparseNonValueVecSlots + numIdSlots < 1) - continue; - for (int async : numTwoArray) { - for (int useGpu : numTwoArray) { - if (async && useGpu) { - // Currently in async mode, useGpu is not supported - continue; - } -#ifndef PADDLE_WITH_CUDA - if (useGpu) { - continue; - } -#endif - LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots - << " numSparseNonValueVecSlots=" - << numSparseNonValueVecSlots - << " numIdSlots=" << numIdSlots << " async=" << async - << " useGpu=" << useGpu; - int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0}; - numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots; - numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] = - numSparseNonValueVecSlots; - numPerSlotType[SlotDef::INDEX] = numIdSlots; - testProtoSequenceDataProvider(numPerSlotType, async, useGpu); - } // end for (int useGpu : numTwoArray) - } // end for (int async : numTwoArray) - } // end for (int numDenseVecSlots : numSlotsArray) - } // end for (int numIdSlots : numSlotsArray) - } // end for (int numSparseNonValueVecSlots : numSlotsArray) -} diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 4adaaef9838f0d178468af3af142031325bfc11d..a2ef731ecbcd18ca4bd0b2381de04650a2686c2d 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -17,9 +17,13 @@ limitations under the License. */ #include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" +#ifndef PADDLE_MOBILE_INFERENCE DEFINE_int32(pool_limit_size, 536870912, "maximum memory size managed by a memory pool, default is 512M"); +#else +DEFINE_int32(pool_limit_size, 0, "default is 0"); +#endif namespace paddle { diff --git a/paddle/memory/README.md b/paddle/memory/README.md index 7f95e80f980b0c0b93ecb418e6b923045313eaa5..6cb003c50bc7d142d65b0591e7e5235431d2ea42 100644 --- a/paddle/memory/README.md +++ b/paddle/memory/README.md @@ -1,4 +1,141 @@ # Region-based Heterogeneous Memory Management +## Design -Please check out the [design documentation](http://gangliao.me) to find out more details about -buddy memory allocator for both CPU and GPU. +### Usage + +To allocate 4KB CPU memory: + +```cpp +p = memory::Alloc(platform::CPUPlace(), 4*1024); +``` + +To allocate 4KB memory on the 3rd GPU: + +```cpp +p = memory::Alloc(platform::GPUPlace(2), 4*1024); +``` + +To free memory and check the so-far used amount of memory on a place: + +```cpp +auto pl = platform::GPUPlace(0); +p = memory::Alloc(pl, 4*1024); +cout << memory::Used(pl); +memory::Free(pl, p); +``` + +### API + +In `paddle/memory/memory.h` we have: + +```cpp +namespace memory { +template void* Alloc(Place, size_t); +template void Free(Place, void*); +template size_t Used(Place); +} // namespace memory +``` + +These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`: + +```cpp +template<> +void* Alloc(CPUPlace p, size_t size) { + return GetCPUBuddyAllocator()->Alloc(size); +} +``` + +and + +```cpp +template<> +void Alloc(GPUPlace p, size_t size) { + return GetGPUBuddyAllocator(p.id)->Alloc(size); +} +``` + +Similar specializations exist for `Free` and `Used`. + +### Implementation + +`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletions. + +```cpp +BuddyAllocator* GetCPUBuddyAllocator() { + static BuddyAllocator* a = NULL; + if (a == NULL) { + a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...); + } + return a; +} + +BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static BuddyAllocator* as = NULL; + if (as == NULL) { + as = new BuddyAllocator*[platform::NumGPUs()]; + for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) { + as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...); + } + } + return as[gpu_id); +``` + +#### `BuddyAllocator` + +`BuddyAllocator` implements the buddy allocation algorithm. Its constructor takes parameters only related with the algorithm: + +```cpp +BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) { + ... +} +``` + +Please be aware that **`BuddyAllocator` always allocate aligned memory**, aligned on 32-bytes, which can hold a `BuddyAllocator::Block` object: + +```cpp +class BuddyAllocator { + private: + struct Block { + size_t size; + Block* left, right; + size_t index; // allocator id + }; + ... +}; +``` + +Because BuddyAllocator has the meta-data of each block, it can trace the used memory -- record the amount returned by `Alloc` freed in `Free`. Instead, `CPUAllocator` and `GPUAllocator` doesn't know the size of freed memory block and cannot do the trace. + +#### System Allocators + +The `GPUAllocator` and `CPUAllocator` are calls *system allocators*. They work as the fallback allocators of `BuddyAllocator`. + +## Justification + +I got inspiration from Majel and Caffe2, though above design look different from both. + +### Caffe2 + +In Caffe2, `Tensor::mutable_data()` allocates the memroy. In particular, [`Tensor::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479). + +There are two implementations of `Context`: + +1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory. + +1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory. + +### Majel + +In Majel, there are basically two allocator types: + +1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`. +1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`. + +However, memory allocation is not via these two allocators. Instead, these two allocators are defined in hidden namespaces. + +In Majel there are hidden global variables like: + +1. `cpu::SystemAllocator g_cpu_allocator`, and +1. `vector g_gpu_allocators(NUM_GPUS)`. + +Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`. diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 709f7de2e43093114d096cbfca5b5d49293a6d3e..a719da2560291dbc7e98aadfae41d4692d8afcad 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -9,6 +9,7 @@ function(op_library TARGET) set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) set(cc_srcs) set(cu_srcs) + set(cu_cc_srcs) set(op_common_deps operator op_registry math_function) set(options "") set(oneValueArgs "") @@ -22,6 +23,9 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) list(APPEND cc_srcs ${TARGET}.cc) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) + list(APPEND cu_cc_srcs ${TARGET}.cu.cc) + endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() @@ -29,6 +33,8 @@ function(op_library TARGET) foreach(src ${op_library_SRCS}) if (${src} MATCHES ".*\\.cu$") list(APPEND cu_srcs ${src}) + elseif(${src} MATCHES ".*\\.cu.cc$") + list(APPEND cu_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) else() @@ -43,7 +49,7 @@ function(op_library TARGET) endif() if (WITH_GPU) - nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} + nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS} @@ -140,7 +146,9 @@ function(op_library TARGET) # pybind USE_CPU_ONLY_OP list(LENGTH cu_srcs cu_srcs_len) - if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0) + list(LENGTH cu_cc_srcs cu_cc_srcs_len) + + if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) endif() @@ -160,11 +168,12 @@ set(DEPS_OPS recurrent_op dynamic_recurrent_op softmax_with_cross_entropy_op + softmax_op + sequence_softmax_op sum_op pool_op pool_with_index_op conv_op - lstm_op conv_transpose_op nccl_op sequence_conv_op @@ -174,13 +183,20 @@ set(DEPS_OPS array_to_lod_tensor_op lstm_op tensor_array_read_write_op - gru_op) + gru_op + adagrad_op + sgd_op) + op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cross_entropy_op DEPS cross_entropy) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) +op_library(softmax_op DEPS softmax) +op_library(sequence_softmax_op DEPS softmax) +op_library(sum_op DEPS selected_rows_functor) +op_library(sgd_op DEPS selected_rows_functor) +op_library(adagrad_op DEPS selected_rows_functor) op_library(conv_op DEPS vol2col) -op_library(sum_op DEPS net_op selected_rows_functor) op_library(pool_op DEPS pooling) op_library(pool_with_index_op DEPS pooling) op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table) @@ -220,6 +236,6 @@ cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc rnn/recurrent_op_utils.cc DEPS dynamic_recurrent_op) if(WITH_GPU) - nv_test(nccl_op_test SRCS nccl_op_test.cu DEPS nccl_op gpu_info device_context) + cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc index 8d1a2b7938d2c6607cbeb3cecb72d1d5b83dd8b9..d6686e3ef3165976cf4c077a7a0f213082aa7716 100644 --- a/paddle/operators/adagrad_op.cc +++ b/paddle/operators/adagrad_op.cc @@ -14,6 +14,11 @@ limitations under the License. */ #include "paddle/operators/adagrad_op.h" +#include + +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/selected_rows_functor.h" + namespace paddle { namespace operators { @@ -21,7 +26,7 @@ class AdagradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Param"), "Input(Param) of AdagradOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Grad"), @@ -54,8 +59,8 @@ class AdagradOp : public framework::OperatorWithKernel { class AdagradOpMaker : public framework::OpProtoAndCheckerMaker { public: - AdagradOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + AdagradOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor) Input parameter"); AddInput("Grad", "(Tensor) Input gradient"); @@ -87,10 +92,85 @@ for numerical stability to avoid the division by zero error. )DOC"); } }; + +namespace { +size_t FindPos(const std::vector& rows, int64_t value) { + return std::find(rows.begin(), rows.end(), value) - rows.begin(); +} +} // namespace + +template +struct SparseAdagradFunctor { + void operator()(const platform::DeviceContext& context, + const framework::SelectedRows& grad, + const framework::Tensor& learning_rate, T epsilon, + framework::Tensor* moment, framework::Tensor* param) { + // 1. g_m.rows = set(g.rows) + auto grad_rows = grad.rows(); + std::set row_set(grad_rows.begin(), grad_rows.end()); + std::vector merge_rows(row_set.begin(), row_set.end()); + + auto grad_width = grad.value().dims()[1]; + std::unique_ptr grad_merge{ + new framework::SelectedRows()}; + grad_merge->set_rows(merge_rows); + grad_merge->set_height(grad.height()); + grad_merge->mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), grad_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, grad_merge->mutable_value(), 0.0); + + auto* grad_merge_data = grad_merge->mutable_value()->data(); + auto* grad_data = grad.value().data(); + + for (size_t i = 0; i < grad_rows.size(); i++) { + size_t grad_merge_i = FindPos(merge_rows, grad_rows[i]); + for (int64_t j = 0; j < grad_width; j++) { + grad_merge_data[grad_merge_i * grad_width + j] += + grad_data[i * grad_width + j]; + } + } + + // 2. m += g_m * g_m + std::unique_ptr grad_square{ + new framework::SelectedRows()}; + grad_square->set_rows(grad_merge->rows()); + grad_square->set_height(grad_merge->height()); + grad_square->mutable_value()->mutable_data(grad_merge->value().dims(), + context.GetPlace()); + auto gs = + framework::EigenVector::Flatten(*(grad_square->mutable_value())); + auto gm = framework::EigenVector::Flatten(grad_merge->value()); + gs.device(*context.GetEigenDevice()) = gm * gm; + + math::SelectedRowsAddToTensor functor; + functor(context, *grad_square, moment); + + // 3. update parameter + auto* lr = learning_rate.data(); + auto* param_data = param->data(); + auto* moment_data = moment->data(); + + for (size_t i = 0; i < merge_rows.size(); i++) { + for (int64_t j = 0; j < grad_width; j++) { + param_data[merge_rows[i] * grad_width + j] -= + lr[0] * grad_merge_data[i * grad_width + j] / + (std::sqrt(moment_data[merge_rows[i] * grad_width + j]) + epsilon); + } + } + } +}; + +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker); -REGISTER_OP_CPU_KERNEL(adagrad, - ops::AdagradOpKernel); +REGISTER_OP_CPU_KERNEL( + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu index a5b7951121360f78612f9008a522235104708112..5b869e6bc5f4604ba6055ffd62fa21e4a1f41b93 100644 --- a/paddle/operators/adagrad_op.cu +++ b/paddle/operators/adagrad_op.cu @@ -14,7 +14,138 @@ #define EIGEN_USE_GPU #include "paddle/operators/adagrad_op.h" +#include "paddle/operators/math/selected_rows_functor.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +namespace { + +template +__global__ void MergeGradKernel(const T* grad, const int64_t* grad_rows, + T* grad_merge, const int64_t* grad_merge_rows, + size_t grad_merge_rows_size, + int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + __shared__ size_t grad_merge_idx; + + if (tid == 0) { + for (size_t i = 0; i < grad_merge_rows_size; i++) { + if (grad_rows[ty] == grad_merge_rows[i]) { + grad_merge_idx = i; + } + } + } + + __syncthreads(); + + grad += ty * row_numel; + grad_merge += grad_merge_idx * row_numel; + for (int index = tid; index < row_numel; index += block_size) { + paddle::platform::CudaAtomicAdd(grad_merge + index, grad[index]); + } +} + +template +__global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows, + const T* learning_rate, T* param, + T* moment, int64_t row_numel, + T epsilon) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + grad += ty * row_numel; + param += rows[ty] * row_numel; + moment += rows[ty] * row_numel; + + for (int index = tid; index < row_numel; index += block_size) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. + paddle::platform::CudaAtomicAdd(param + index, + -1.0 * learning_rate[0] * grad[index] / + (sqrt(moment[index]) + epsilon)); + } +} +} // namespace + +template +struct SparseAdagradFunctor { + void operator()(const platform::DeviceContext& context, + const framework::SelectedRows& grad, + const framework::Tensor& learning_rate, T epsilon, + framework::Tensor* moment, framework::Tensor* param) { + // 1. g_m.rows = set(g.rows) + auto grad_rows = grad.rows(); + std::set row_set(grad_rows.begin(), grad_rows.end()); + std::vector merge_rows(row_set.begin(), row_set.end()); + + auto grad_width = grad.value().dims()[1]; + std::unique_ptr grad_merge{ + new framework::SelectedRows()}; + grad_merge->set_rows(merge_rows); + grad_merge->set_height(grad.height()); + grad_merge->mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), grad_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, grad_merge->mutable_value(), 0.0); + + auto* grad_merge_data = grad_merge->mutable_value()->data(); + auto* grad_data = grad.value().data(); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid1(1, grad_rows.size()); + + MergeGradKernel< + T, 256><<(context) + .stream()>>>(grad_data, grad.rows().data(), + grad_merge_data, grad_merge->rows().data(), + grad_merge->rows().size(), grad_width); + + // 2. m += g_m * g_m + std::unique_ptr grad_square{ + new framework::SelectedRows()}; + grad_square->set_rows(grad_merge->rows()); + grad_square->set_height(grad_merge->height()); + grad_square->mutable_value()->mutable_data(grad_merge->value().dims(), + context.GetPlace()); + auto gs = + framework::EigenVector::Flatten(*(grad_square->mutable_value())); + auto gm = framework::EigenVector::Flatten(grad_merge->value()); + gs.device(*context.GetEigenDevice()) = gm * gm; + + math::SelectedRowsAddToTensor functor; + functor(context, *grad_square, moment); + + // 3. update parameter + auto* lr = learning_rate.data(); + auto* param_data = param->data(); + auto* moment_data = moment->data(); + + dim3 grid2(1, merge_rows.size()); + SparseAdagradFunctorKernel< + T, 256><<(context) + .stream()>>>(grad_merge_data, grad_merge->rows().data(), + lr, param_data, + moment_data, grad_width, epsilon); + } +}; + +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; + +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(adagrad, - ops::AdagradOpKernel); +REGISTER_OP_GPU_KERNEL( + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h index c5d8f751d3527f89b96d4274328ba0bb5f6efa44..4d4a6434c7c472d8ceb01edfc4050fbb009d6c9f 100644 --- a/paddle/operators/adagrad_op.h +++ b/paddle/operators/adagrad_op.h @@ -19,35 +19,59 @@ limitations under the License. */ namespace paddle { namespace operators { +template +struct SparseAdagradFunctor { + void operator()(const platform::DeviceContext& context, + const framework::SelectedRows& grad, + const framework::Tensor& learning_rate, T epsilon, + framework::Tensor* moment, framework::Tensor* param); +}; + template class AdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto param_out_tensor = ctx.Output("ParamOut"); - auto moment_out_tensor = ctx.Output("MomentOut"); + auto* param_out_tensor = ctx.Output("ParamOut"); + auto* moment_out_tensor = ctx.Output("MomentOut"); param_out_tensor->mutable_data(ctx.GetPlace()); moment_out_tensor->mutable_data(ctx.GetPlace()); - float epsilon = ctx.Attr("epsilon"); - - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - auto moment = framework::EigenVector::Flatten( - *ctx.Input("Moment")); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); - - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto place = ctx.GetEigenDevice(); - - moment_out.device(place) = moment + grad * grad; - Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(place) = - param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + T epsilon = static_cast(ctx.Attr("epsilon")); + + auto* grad_var = ctx.InputVar("Grad"); + if (grad_var->IsType()) { + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment = framework::EigenVector::Flatten( + *ctx.Input("Moment")); + auto lr = framework::EigenVector::Flatten( + *ctx.Input("LearningRate")); + + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); + auto place = ctx.GetEigenDevice(); + + moment_out.device(place) = moment + grad * grad; + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + param_out.device(place) = + param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + } else if (grad_var->IsType()) { + auto* param_tensor = ctx.Input("Param"); + PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor); + + auto* moment_tensor = ctx.Input("Moment"); + PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor); + + SparseAdagradFunctor functor; + functor(ctx.device_context(), *ctx.Input("Grad"), + *ctx.Input("LearningRate"), epsilon, + moment_out_tensor, param_out_tensor); + } else { + PADDLE_THROW("Unsupported Variable Type of Grad"); + } } }; diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h index 666043e824f885e9c0e79e319d0a38ba108c209a..233a81198e336d3190565fb18556f96979cec0ce 100644 --- a/paddle/operators/array_operator.h +++ b/paddle/operators/array_operator.h @@ -42,6 +42,7 @@ class ArrayOp : public framework::OperatorBase { } else { offset = static_cast(*i_tensor.data()); } + VLOG(10) << " Offset = " << offset; return offset; } }; diff --git a/paddle/operators/batch_norm_op.cu b/paddle/operators/batch_norm_op.cu.cc similarity index 100% rename from paddle/operators/batch_norm_op.cu rename to paddle/operators/batch_norm_op.cu.cc diff --git a/paddle/operators/bilinear_tensor_product_op.h b/paddle/operators/bilinear_tensor_product_op.h index ffa4f43a327418498c1f110504127e7d2878409d..1113a4c6f357edb4f6b14b73c6eec9c6cca24ce5 100644 --- a/paddle/operators/bilinear_tensor_product_op.h +++ b/paddle/operators/bilinear_tensor_product_op.h @@ -174,7 +174,7 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { // Caculate the gradient of Input(Bias). if (d_bias) { d_bias->mutable_data(ctx.GetPlace()); - auto d_bias_mat = EigenMatrix::From(*d_bias); + auto d_bias_mat = framework::EigenVector::Flatten(*d_bias); d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes(0)); } } diff --git a/paddle/operators/concat_op.cu b/paddle/operators/concat_op.cu.cc similarity index 100% rename from paddle/operators/concat_op.cu rename to paddle/operators/concat_op.cu.cc diff --git a/paddle/operators/conv2d_transpose_cudnn_op.cu b/paddle/operators/conv2d_transpose_cudnn_op.cu.cc similarity index 96% rename from paddle/operators/conv2d_transpose_cudnn_op.cu rename to paddle/operators/conv2d_transpose_cudnn_op.cu.cc index 694526ec01214acf2ec6a3d68d3cf072739ac185..eff058afc6cc5dacf2a054a33f352824865c1924 100644 --- a/paddle/operators/conv2d_transpose_cudnn_op.cu +++ b/paddle/operators/conv2d_transpose_cudnn_op.cu.cc @@ -200,9 +200,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { T alpha = 1.0f, beta = 0.0f; if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*input_grad); - t.device(ctx.GetEigenDevice()) = - t.constant(static_cast(0)); + math::set_constant(ctx.device_context(), input_grad, 0); PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( handle, &alpha, cudnn_output_desc, output_grad_data, @@ -214,9 +212,8 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*filter_grad); - t.device(ctx.GetEigenDevice()) = - t.constant(static_cast(0)); + math::set_constant(ctx.device_context(), filter_grad, 0); + // Gradient with respect to the filter PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc, diff --git a/paddle/operators/conv_cudnn_op.cu b/paddle/operators/conv_cudnn_op.cu.cc similarity index 100% rename from paddle/operators/conv_cudnn_op.cu rename to paddle/operators/conv_cudnn_op.cu.cc diff --git a/paddle/operators/conv_op.cu b/paddle/operators/conv_op.cu.cc similarity index 100% rename from paddle/operators/conv_op.cu rename to paddle/operators/conv_op.cu.cc diff --git a/paddle/operators/conv_shift_op.cu b/paddle/operators/conv_shift_op.cu index 74ed1b0ed358afc4f1a4e6a0c322eb032029d551..95e13c38a8dd234f49393d2d4808607a447b0d4c 100644 --- a/paddle/operators/conv_shift_op.cu +++ b/paddle/operators/conv_shift_op.cu @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/operators/conv_shift_op.h" +#include "paddle/operators/math/math_function.h" #include "paddle/platform/cuda_helper.h" namespace paddle { @@ -22,7 +23,7 @@ using framework::Tensor; namespace { -inline int div_up(int x, int y) { return (x + y - 1) / y; } +inline int DivUp(int x, int y) { return (x + y - 1) / y; } // Some notes on the design: // @@ -33,9 +34,9 @@ inline int div_up(int x, int y) { return (x + y - 1) / y; } // y is fairly small. For large y, it would probably be more efficient // to also tile across y. template -__global__ void conv_shift_forward(const T *x, const T *y, T *out, int x_width, - int y_width, int y_half_width, - int batch_size) { +__global__ void ConvShiftForward(const T *x, const T *y, int x_width, + int y_width, int y_half_width, int batch_size, + T *out) { extern __shared__ T mem[]; int tx = threadIdx.x; @@ -62,25 +63,26 @@ __global__ void conv_shift_forward(const T *x, const T *y, T *out, int x_width, if (tx < num_x) { int load_i = (i - y_half_width + x_width) % x_width; sx[tx] = x[k * x_width + load_i]; - } else { - return; } __syncthreads(); - // Compute dot product of sx[tx:tx + y_width] and sy. - T sum = 0; - for (int j = 0; j < y_width; ++j) { - sum += sx[tx + j] * sy[j]; - } + if (tx < num_x) { + // Compute dot product of sx[tx:tx + y_width] and sy. + T sum = 0; + for (int j = 0; j < y_width; ++j) { + sum += sx[tx + j] * sy[j]; + } - // Save to out[k, i]. - out[k * x_width + i] = sum; + // Save to out[k, i]. + out[k * x_width + i] = sum; + } } // Compute x gradient - initial naive implementation with atomic add. template -__global__ void conv_shift_dx(const T *dout, const T *y, T *dx, int x_width, - int y_width, int y_half_width, int batch_size) { +__global__ void ConvShiftGradX(const T *dout, const T *y, int x_width, + int y_width, int y_half_width, int batch_size, + T *dx) { int i = blockIdx.x * blockDim.x + threadIdx.x; // x index int j = blockIdx.y; // y index int k = blockIdx.z; // batch index @@ -94,8 +96,8 @@ __global__ void conv_shift_dx(const T *dout, const T *y, T *dx, int x_width, // Compute y gradient - initial naive implementation with atomic add. template -__global__ void conv_shift_dy(const T *x, const T *dout, T *dy, int x_width, - int y_width, int y_half_width, int batch_size) { +__global__ void ConvShiftDy(const T *x, const T *dout, int x_width, int y_width, + int y_half_width, int batch_size, T *dy) { int i = blockIdx.x * blockDim.x + threadIdx.x; // x index int j = blockIdx.y; // y index int k = blockIdx.z; // batch index @@ -125,15 +127,15 @@ class ConvShiftKernel : public framework::OpKernel { int y_half_width = (y_width - 1) / 2; const int x_per_block = 256; - int num_x_blocks = div_up(x_width, x_per_block); + int num_x_blocks = DivUp(x_width, x_per_block); int mem_per_block = (x_per_block + 2 * y_width) * sizeof(T); dim3 grid_dim(num_x_blocks, batch_size); auto stream = context.cuda_device_context().stream(); - conv_shift_forward<<>>( - x_data, y_data, out_data, x_width, y_width, y_half_width, batch_size); + ConvShiftForward<<>>( + x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data); } }; @@ -157,25 +159,26 @@ class ConvShiftGradKernel int y_width = Y->dims()[1]; int y_half_width = (y_width - 1) / 2; - auto stream = context.cuda_device_context().stream(); + auto &device_ctx = context.cuda_device_context(); + math::SetConstant zero; const int x_per_block = 256; - int num_x_blocks = div_up(x_width, x_per_block); + int num_x_blocks = DivUp(x_width, x_per_block); dim3 grid_dim(num_x_blocks, y_width, batch_size); if (dX) { T *dx_data = dX->mutable_data(context.GetPlace()); - cudaMemsetAsync(dx_data, 0, dX->numel() * sizeof(T), stream); - conv_shift_dx<<>>( - dout_data, y_data, dx_data, x_width, y_width, y_half_width, - batch_size); + zero(device_ctx, dX, static_cast(0.0)); + ConvShiftGradX<<>>( + dout_data, y_data, x_width, y_width, y_half_width, batch_size, + dx_data); } if (dY) { T *dy_data = dY->mutable_data(context.GetPlace()); - cudaMemsetAsync(dy_data, 0, dY->numel() * sizeof(T), stream); - conv_shift_dy<<>>( - x_data, dout_data, dy_data, x_width, y_width, y_half_width, - batch_size); + zero(device_ctx, dY, static_cast(0.0)); + ConvShiftDy<<>>( + x_data, dout_data, x_width, y_width, y_half_width, batch_size, + dy_data); } } }; diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc index 13ac0cd54cbeb8f68c2246f7e1d02f032266a72e..310e3f5c937bd1345663b2c2307610a485a027ef 100644 --- a/paddle/operators/conv_transpose_op.cc +++ b/paddle/operators/conv_transpose_op.cc @@ -30,11 +30,6 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); - for (size_t i = 0; i < paddings.size(); ++i) { - PADDLE_ENFORCE_EQ(paddings[i], 0, - "No Padding allowed in conv transpose op."); - } - PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, "ConvTransposeOp intput should be 4-D or 5-D tensor."); PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(), @@ -52,7 +47,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { std::vector output_shape({in_dims[0], filter_dims[1]}); for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back((in_dims[i + 2] - 1) * strides[i] + + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] + filter_dims[i + 2]); } ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); diff --git a/paddle/operators/conv_transpose_op.cu b/paddle/operators/conv_transpose_op.cu.cc similarity index 100% rename from paddle/operators/conv_transpose_op.cu rename to paddle/operators/conv_transpose_op.cu.cc diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h index 4b2bd60437da8f58054d8cdd5e6ba1fdac05f0d5..ab336ad23ce1c180b68d04e4c85b299e301d5376 100644 --- a/paddle/operators/conv_transpose_op.h +++ b/paddle/operators/conv_transpose_op.h @@ -62,7 +62,6 @@ class GemmConvTransposeKernel : public framework::OpKernel { Tensor* output = context.Output("Output"); std::vector strides = context.Attr>("strides"); - // Actually, no paddings and groups allowed in conv transpose. std::vector paddings = context.Attr>("paddings"); // TODO(Zhuoyuan): Paddings can be added in future. // groups will alway be disabled in conv2dtranspose. @@ -148,8 +147,8 @@ class GemmConvTransposeKernel : public framework::OpKernel { } else if (filter_shape_vec.size() == 3) { // col2vol: col_matrix -> dy // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w) - col2vol(context.device_context(), col, dilations, strides, - std::vector{0, 0, 0}, &output_batch); + col2vol(context.device_context(), col, dilations, strides, paddings, + &output_batch); } } } @@ -173,7 +172,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { if ((!input_grad) && (!filter_grad)) return; std::vector strides = context.Attr>("strides"); - // Actually, no paddings and groups allowed in conv transpose. std::vector paddings = context.Attr>("paddings"); const int batch_size = static_cast(input->dims()[0]); diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 68c56f531f941e1b8f66ac7ba6bf318881642c4f..62a4e484eceeabc4cc26e68ac54a50be1ac95df7 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -132,7 +132,7 @@ class CosSimGradKernel : public framework::OpKernel { // compute dy if (out_grad_y) { out_grad_y->mutable_data(context.GetPlace()); - auto dy = EigenMatrix::Reshape(*out_grad_y, 1); + auto dy = EigenVector::Flatten(*out_grad_y); auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast; dy.device(place) = (dz_bcast * grad).sum(Eigen::array({{0}})); } diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 530b319a44eac915f0d49eb55bfe5929908eab26..6212e39dfde33c5943958adbd1a0a052262e119e 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -23,8 +23,6 @@ template __global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X, const int64_t* label, const int N, const int D) { - // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file. - // CUDA_1D_KERNEL_LOOP(i, N) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { int idx = i * D + label[i]; diff --git a/paddle/operators/detail/safe_ref.h b/paddle/operators/detail/safe_ref.h new file mode 100644 index 0000000000000000000000000000000000000000..b71af17309f9f46b5c87f0f479d4e03443fa7f93 --- /dev/null +++ b/paddle/operators/detail/safe_ref.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +namespace paddle { +namespace operators { +namespace detail { +/** + * Get Reference From Pointer with check. The error message is printf format, + * and passed by `args` + */ +template +inline T &Ref(T *ptr, ARGS &&... args) { + PADDLE_ENFORCE(ptr != nullptr, args...); + return *ptr; +} +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc index 85871ebbfcd8ee38ef5e8078d1d6cb6bdda46a7b..985b5d1e865e513d833bff72dcd20a8f20851d8c 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cc @@ -101,4 +101,7 @@ REGISTER_OPERATOR(fill_constant_batch_size_like, REGISTER_OP_CPU_KERNEL( fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel); + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/operators/fill_constant_batch_size_like_op.cu b/paddle/operators/fill_constant_batch_size_like_op.cu.cc similarity index 81% rename from paddle/operators/fill_constant_batch_size_like_op.cu rename to paddle/operators/fill_constant_batch_size_like_op.cu.cc index 298c196f1dfef388640e34153264986bd518a11a..9e7a1eeab863c962ca72908e561e12a04d5021c5 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cu +++ b/paddle/operators/fill_constant_batch_size_like_op.cu.cc @@ -12,11 +12,14 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/framework/op_registry.h" #include "paddle/operators/fill_constant_batch_size_like_op.h" +#include "paddle/framework/op_registry.h" namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL( fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel); + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc index 8ab39d4fb012b8fa3883f33e4d15be7918500354..95fb5932b8b555e1357adc9fdfb7b6e6db7da71d 100644 --- a/paddle/operators/fill_zeros_like_op.cc +++ b/paddle/operators/fill_zeros_like_op.cc @@ -54,5 +54,8 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, ops::FillZerosLikeOpMaker); REGISTER_OP_CPU_KERNEL( - fill_zeros_like, - ops::FillZerosLikeKernel); + fill_zeros_like, ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.cu b/paddle/operators/fill_zeros_like_op.cu.cc similarity index 69% rename from paddle/operators/fill_zeros_like_op.cu rename to paddle/operators/fill_zeros_like_op.cu.cc index a6d4ba64bde534ea76867c456537b130a45b9496..1501a17441072223ba0e8cf5b6c8cdd5e903a467 100644 --- a/paddle/operators/fill_zeros_like_op.cu +++ b/paddle/operators/fill_zeros_like_op.cu.cc @@ -12,10 +12,13 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/framework/op_registry.h" #include "paddle/operators/fill_zeros_like_op.h" +#include "paddle/framework/op_registry.h" namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL( - fill_zeros_like, - ops::FillZerosLikeKernel); + fill_zeros_like, ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/operators/gru_op.cu b/paddle/operators/gru_op.cu.cc similarity index 97% rename from paddle/operators/gru_op.cu rename to paddle/operators/gru_op.cu.cc index 35538c74b4bf678f8068999bfadb2589a1671be0..0ceff94ec3ddaadbd5f0ca4f5a4eebe6cb8ee3a9 100644 --- a/paddle/operators/gru_op.cu +++ b/paddle/operators/gru_op.cu.cc @@ -12,7 +12,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU #include "paddle/operators/gru_op.h" namespace ops = paddle::operators; diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h index ba90ec9816c40a6a49065ac6efcee6b93dffce90..55e9cc4a98bd6d36ce5d6bb4116039d0ec18b485 100644 --- a/paddle/operators/gru_op.h +++ b/paddle/operators/gru_op.h @@ -27,10 +27,6 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template -using EigenMatrix = framework::EigenMatrix; - template class GRUKernel : public framework::OpKernel { public: @@ -57,19 +53,15 @@ class GRUKernel : public framework::OpKernel { bool is_reverse = context.Attr("is_reverse"); math::LoDTensor2BatchFunctor to_batch; - to_batch(context.device_context(), *input, *batch_gate, true, is_reverse); + auto& dev_ctx = context.device_context(); + to_batch(dev_ctx, *input, *batch_gate, true, is_reverse); - int frame_size = hidden_dims[1]; - int batch_size = hidden_dims[0]; - auto g = EigenMatrix::From(*batch_gate); - auto place = context.GetEigenDevice(); if (bias) { - auto b = EigenMatrix::From(*bias); - g.device(place) = g + - b.reshape(Eigen::array({{1, frame_size * 3}})) - .broadcast(Eigen::array({{batch_size, 1}})); + math::RowwiseAdd add_bias; + add_bias(dev_ctx, *batch_gate, *bias, batch_gate); } + int frame_size = hidden_dims[1]; math::hl_gru_value gru_value; gru_value.gateWeight = const_cast(weight_data); gru_value.stateWeight = @@ -89,7 +81,7 @@ class GRUKernel : public framework::OpKernel { gru_value.gateValue = gate_t.data(); gru_value.resetOutputValue = reset_hidden_prev_t.data(); math::GRUUnitFunctor::compute( - context.device_context(), gru_value, frame_size, cur_batch_size, + dev_ctx, gru_value, frame_size, cur_batch_size, math::ActiveType(context.Attr("activation")), math::ActiveType(context.Attr("gate_activation"))); gru_value.prevOutValue = gru_value.outputValue; @@ -97,7 +89,7 @@ class GRUKernel : public framework::OpKernel { math::Batch2LoDTensorFunctor to_seq; batch_hidden->set_lod(batch_gate->lod()); - to_seq(context.device_context(), *batch_hidden, *hidden); + to_seq(dev_ctx, *batch_hidden, *hidden); } void Compute(const framework::ExecutionContext& context) const override { @@ -138,15 +130,14 @@ class GRUGradKernel : public framework::OpKernel { batch_reset_hidden_prev_grad.mutable_data(hidden_dims, context.GetPlace()); math::SetConstant zero; - zero(context.device_context(), &batch_hidden_grad, static_cast(0.0)); - zero(context.device_context(), &batch_gate_grad, static_cast(0.0)); - zero(context.device_context(), &batch_reset_hidden_prev_grad, - static_cast(0.0)); + auto& dev_ctx = context.device_context(); + zero(dev_ctx, &batch_hidden_grad, static_cast(0.0)); + zero(dev_ctx, &batch_gate_grad, static_cast(0.0)); + zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast(0.0)); bool is_reverse = context.Attr("is_reverse"); batch_hidden_grad.set_lod(batch_hidden->lod()); - to_batch(context.device_context(), *hidden_grad, batch_hidden_grad, false, - is_reverse); + to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse); math::hl_gru_value gru_value; gru_value.gateWeight = const_cast(weight_data); @@ -157,7 +148,7 @@ class GRUGradKernel : public framework::OpKernel { if (weight_grad) { gru_grad.gateWeightGrad = weight_grad->mutable_data(context.GetPlace()); - zero(context.device_context(), weight_grad, static_cast(0.0)); + zero(dev_ctx, weight_grad, static_cast(0.0)); gru_grad.stateWeightGrad = weight_grad->data() + 2 * frame_size * frame_size; } else { @@ -188,7 +179,7 @@ class GRUGradKernel : public framework::OpKernel { gru_value.prevOutValue = const_cast(h0_data); if (h0_grad) { T* h0_grad_data = h0_grad->mutable_data(context.GetPlace()); - zero(context.device_context(), h0_grad, static_cast(0.0)); + zero(dev_ctx, h0_grad, static_cast(0.0)); gru_grad.prevOutGrad = h0_grad_data; } else { gru_grad.prevOutGrad = nullptr; @@ -202,8 +193,7 @@ class GRUGradKernel : public framework::OpKernel { } math::GRUUnitGradFunctor::compute( - context.device_context(), gru_value, gru_grad, frame_size, - cur_batch_size, + dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, math::ActiveType(context.Attr("activation")), math::ActiveType(context.Attr("gate_activation"))); } @@ -211,14 +201,12 @@ class GRUGradKernel : public framework::OpKernel { input_grad->mutable_data(context.GetPlace()); math::Batch2LoDTensorFunctor to_seq; batch_gate_grad.set_lod(batch_gate->lod()); - to_seq(context.device_context(), batch_gate_grad, *input_grad); + to_seq(dev_ctx, batch_gate_grad, *input_grad); } if (bias_grad) { bias_grad->mutable_data(context.GetPlace()); - auto d_b = EigenMatrix::From(*bias_grad); - auto d_g = EigenMatrix::From(batch_gate_grad); - auto place = context.GetEigenDevice(); - d_b.device(place) = d_g.sum(Eigen::array({{0}})); + math::ColwiseSum col_sum; + col_sum(dev_ctx, batch_gate_grad, bias_grad); } } diff --git a/paddle/operators/is_empty_op.cc b/paddle/operators/is_empty_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..54fecf44e881b5c283c81580fd161da9808d253e --- /dev/null +++ b/paddle/operators/is_empty_op.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" + +namespace paddle { +namespace operators { + +constexpr char kInput[] = "X"; +constexpr char kOutput[] = "Out"; + +class IsEmptyOp : public framework::OperatorBase { + public: + IsEmptyOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::DeviceContext &dev_ctx) const override { + // get input + auto *var = scope.FindVar(Input(kInput)); + PADDLE_ENFORCE_NOT_NULL(var); + auto &tensor = var->Get(); + // get output + auto *out = scope.FindVar(Output(kOutput)); + PADDLE_ENFORCE_NOT_NULL(out); + auto *out_tensor = out->GetMutable(); + + out_tensor->Resize({1}); + out_tensor->mutable_data(platform::CPUPlace())[0] = + framework::product(tensor.dims()) == 0; + } +}; + +class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + IsEmptyOpProtoMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kInput, "(Tensor) Tensor which is to be checked."); + AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not."); + AddComment(R"DOC( +IsEmpty Operator which checks whether a tensor is empty. + +It will just return product(tensor.ddims()) > 0; + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(is_empty, paddle::operators::IsEmptyOp, + paddle::operators::IsEmptyOpProtoMaker); diff --git a/paddle/operators/lstm_op.cu b/paddle/operators/lstm_op.cu.cc similarity index 97% rename from paddle/operators/lstm_op.cu rename to paddle/operators/lstm_op.cu.cc index 9ad56941553bf19a56c25f41f76fe20dfa3a106f..610cbb03e890203407b1489800bc17f1a196d12c 100644 --- a/paddle/operators/lstm_op.cu +++ b/paddle/operators/lstm_op.cu.cc @@ -12,7 +12,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU #include "paddle/operators/lstm_op.h" namespace ops = paddle::operators; diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h index fca84e2d8fa832a3780eab7e0fa2facceb4d613b..721aa42c92f2926aabbc13d0a9027b2b4e573225 100644 --- a/paddle/operators/lstm_op.h +++ b/paddle/operators/lstm_op.h @@ -24,10 +24,6 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -template -using EigenMatrix = framework::EigenMatrix; - template inline void ReorderInitState(const platform::DeviceContext& ctx, const framework::Tensor& src, const size_t* index, @@ -65,16 +61,11 @@ class LSTMKernel : public framework::OpKernel { framework::DDim dims({in_dims[0], frame_size}); if (bias) { - Eigen::array extents({{1, 4 * frame_size}}); - Eigen::array offsets({{0, 0}}); - auto b = EigenMatrix::From(*bias); - auto gate = EigenMatrix::From(*batch_gate); - gate.device(ctx.GetEigenDevice()) = - gate + - b.slice(offsets, extents) - .reshape(Eigen::array({{1, frame_size * 4}})) - .broadcast( - Eigen::array({{static_cast(in_dims[0]), 1}})); + Tensor b = *bias; + b.Resize({bias->numel(), 1}); + Tensor gate_bias = b.Slice(0, 4 * frame_size); + math::RowwiseAdd add_bias; + add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } math::LstmMetaValue lstm_value; @@ -350,16 +341,11 @@ class LSTMGradKernel : public framework::OpKernel { } if (bias && bias_g) { /* backward bias */ - int m = static_cast(batch_gate_g.dims()[0]); - int n = static_cast(batch_gate_g.dims()[1]); - - Tensor ones; - ones.mutable_data({m}, ctx.GetPlace()); - math::SetConstant set; - set(device_ctx, &ones, static_cast(1.0)); - - math::gemv(device_ctx, true, m, n, 1., batch_gate_g.data(), - ones.data(), 0., bias_g->data()); + Tensor b_g = *bias_g; + b_g.Resize({bias_g->numel(), 1}); + Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); + math::ColwiseSum col_sum; + col_sum(device_ctx, batch_gate_g, &gate_bias_g); } if (h0 && h0_g) { diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index ab7f23f57043844d45c36acc475422613164bee1..002b68fecf4f1e294387357f0346d9926a2b2b5a 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -1,28 +1,28 @@ add_subdirectory(detail) if(WITH_GPU) - nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context operator) + nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor) - nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator) - nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator) + nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context) + nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) - nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context) + nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function) else() - cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) + cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) - cc_library(softmax SRCS softmax.cc DEPS operator) - cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) + cc_library(softmax SRCS softmax.cc DEPS device_context) + cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context) cc_library(pooling SRCS pooling.cc DEPS device_context) cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function) cc_library(vol2col SRCS vol2col.cc DEPS device_context) - cc_library(context_project SRCS context_project.cc DEPS device_context) + cc_library(context_project SRCS context_project.cc DEPS device_context math_function) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function) diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h index 845de82bbcb33d52184f04ae1594738cb4776eca..72f4202bace4461d2597204feaa2a21e355bd1ac 100644 --- a/paddle/operators/math/context_project.h +++ b/paddle/operators/math/context_project.h @@ -14,9 +14,9 @@ limitations under the License. */ #pragma once -#include "paddle/framework/eigen.h" #include "paddle/framework/lod_tensor.h" #include "paddle/operators/math/im2col.h" +#include "paddle/operators/math/math_function.h" namespace paddle { namespace operators { @@ -24,9 +24,6 @@ namespace math { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template -using EigenMatrix = framework::EigenMatrix; /* * \brief Context projection concatenates features in adjacent time-steps in @@ -152,9 +149,7 @@ class ContextProjectFunctor { Tensor out_t_sub = out_t.Slice(k * context_length, k * context_length + padding_size); Tensor w_sub = padding_data.Slice(k, k + padding_size); - auto out_t_sub_e = EigenMatrix::From(out_t_sub); - auto w_sub_e = EigenMatrix::From(w_sub); - out_t_sub_e.device(*context.GetEigenDevice()) = w_sub_e; + out_t_sub.CopyFrom(w_sub, context.GetPlace(), context); } } if (down_pad > 0) { // add down pad @@ -184,9 +179,7 @@ class ContextProjectFunctor { (down_pad_begin_row + t) * context_length); Tensor w_sub = padding_data.Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); - auto out_t_sub_e = EigenMatrix::From(out_t_sub); - auto w_sub_e = EigenMatrix::From(w_sub); - out_t_sub_e.device(*context.GetEigenDevice()) = w_sub_e; + out_t_sub.CopyFrom(w_sub, context.GetPlace(), context); } } out_t.Resize({sequence_height, context_length * sequence_width}); @@ -265,10 +258,8 @@ class ContextProjectGradFunctor { Tensor out_t_sub = out_t.Slice(k * context_length, k * context_length + padding_size); Tensor w_sub = padding_data->Slice(k, k + padding_size); - auto out_t_sub_e = EigenMatrix::From(out_t_sub); - auto w_sub_e = EigenMatrix::From(w_sub); - w_sub_e.device(*context.GetEigenDevice()) = - w_sub_e + out_t_sub_e; + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); } } if (down_pad > 0) { @@ -299,10 +290,8 @@ class ContextProjectGradFunctor { (down_pad_begin_row + t) * context_length); Tensor w_sub = padding_data->Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); - auto out_t_sub_e = EigenMatrix::From(out_t_sub); - auto w_sub_e = EigenMatrix::From(w_sub); - w_sub_e.device(*context.GetEigenDevice()) = - w_sub_e + out_t_sub_e; + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); } } out_t.Resize({sequence_height, context_length * sequence_width}); diff --git a/paddle/operators/math/cross_entropy.h b/paddle/operators/math/cross_entropy.h index 0ab6827ffa8f8b90b432a801607a97206e010cf4..70ed9ddd551bb8cb7989727c02fea870186c9f2e 100644 --- a/paddle/operators/math/cross_entropy.h +++ b/paddle/operators/math/cross_entropy.h @@ -14,7 +14,6 @@ #pragma once #include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" #include "paddle/framework/tensor.h" #include "paddle/platform/hostdevice.h" diff --git a/paddle/operators/math/im2col.cu b/paddle/operators/math/im2col.cu index 347df7a0ffdec163c0479a71ec775a813930ba5f..bf7894243919571c2ab15d53690b1ef05bfcc6ee 100644 --- a/paddle/operators/math/im2col.cu +++ b/paddle/operators/math/im2col.cu @@ -119,8 +119,8 @@ __global__ void col2im(int n, const T* data_col, int im_height, int im_width, if (index < n) { T val = 0; - int w = index % im_width; - int h = (index / im_width) % im_height; + int w = index % im_width + padding_width; + int h = (index / im_width) % im_height + padding_height; int c = index / (im_width * im_height); // compute the start and end of the output diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index 1b0d4c8bdc683b5203a4bc4b3838560cffe00bc8..2e333a8cde721f8e65dbf2cf5e3aac6272172cc0 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/operators/math/math_function.h" #include "paddle/framework/data_type.h" +#include "paddle/operators/math/math_function_impl.h" namespace paddle { namespace operators { @@ -232,7 +233,36 @@ void gemv(const platform::DeviceContext& context, cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); } +template <> +void axpy(const platform::DeviceContext& context, + const int n, const float alpha, + const float* x, float* y) { + cblas_saxpy(n, alpha, x, 1, y, 1); +} + +template <> +void axpy(const platform::DeviceContext& context, + const int n, const double alpha, + const double* x, double* y) { + cblas_daxpy(n, alpha, x, 1, y, 1); +} + template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; + +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; + +DEFINE_CPU_TRANS(1); +DEFINE_CPU_TRANS(2); +DEFINE_CPU_TRANS(3); +DEFINE_CPU_TRANS(4); +DEFINE_CPU_TRANS(5); +DEFINE_CPU_TRANS(6); struct TensorSetConstantCPU { TensorSetConstantCPU(framework::Tensor* tensor, float value) @@ -280,6 +310,11 @@ void set_constant(const platform::DeviceContext& context, #endif } +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +template struct ColwiseSum; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 817deec94314bdfd2ed7e4b0ba5212c72b813455..58356a4b7783241ca0292829bf05dc1a8ed80c6c 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#define EIGEN_USE_GPU #include "paddle/framework/data_type.h" #include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/math_function_impl.h" namespace paddle { namespace operators { @@ -231,11 +233,46 @@ void gemv(const platform::DeviceContext& context, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1)); } +template <> +void axpy(const platform::DeviceContext& context, + const int n, const float alpha, + const float* x, float* y) { + PADDLE_ENFORCE(platform::dynload::cublasSaxpy( + reinterpret_cast(context) + .cublas_handle(), + n, &alpha, x, 1, y, 1)); +} + +template <> +void axpy(const platform::DeviceContext& context, + const int n, const double alpha, + const double* x, double* y) { + PADDLE_ENFORCE(platform::dynload::cublasDaxpy( + reinterpret_cast(context) + .cublas_handle(), + n, &alpha, x, 1, y, 1)); +} + template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; + +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; + +DEFINE_GPU_TRANS(1); +DEFINE_GPU_TRANS(2); +DEFINE_GPU_TRANS(3); +DEFINE_GPU_TRANS(4); +DEFINE_GPU_TRANS(5); +DEFINE_GPU_TRANS(6); struct TensorSetConstantGPU { TensorSetConstantGPU(const platform::DeviceContext& context, - framework::Tensor* tensor, float value) + framework::Tensor* tensor, float value) : context_(context), tensor_(tensor), value_(value) {} template @@ -257,6 +294,11 @@ void set_constant_with_place( TensorSetConstantGPU(context, tensor, value)); } +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +template struct ColwiseSum; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index c2aaa1d7b7e920c3e6fd9ae4424eae725c3b7c0e..ffb99f53808c4316ede96b04e57aec4dae4134de 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -93,14 +93,21 @@ void gemv(const platform::DeviceContext& context, const bool trans_a, const int M, const int N, const T alpha, const T* A, const T* B, const T beta, T* C); +template +void axpy(const platform::DeviceContext& context, const int n, const T alpha, + const T* x, T* y); + +template +struct Transpose { + void operator()(const platform::DeviceContext& context, + const framework::Tensor& in, framework::Tensor* out, + const std::vector& axis); +}; + template struct SetConstant { void operator()(const platform::DeviceContext& context, - framework::Tensor* tensor, T num) { - auto t = framework::EigenVector::Flatten(*tensor); - t.device(*context.GetEigenDevice()) = - t.constant(static_cast(num)); - } + framework::Tensor* tensor, T num); }; template @@ -110,6 +117,19 @@ void set_constant_with_place(const platform::DeviceContext& context, void set_constant(const platform::DeviceContext& context, framework::Tensor* tensor, float value); +template +struct RowwiseAdd { + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, const framework::Tensor& vec, + framework::Tensor* output); +}; + +template +struct ColwiseSum { + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, framework::Tensor* vec); +}; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..4dc17a4e525c52b8f696277274a7ad00a6b00a08 --- /dev/null +++ b/paddle/operators/math/math_function_impl.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/data_type.h" +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +void SetConstant::operator()(const platform::DeviceContext& context, + framework::Tensor* tensor, T num) { + auto t = framework::EigenVector::Flatten(*tensor); + t.device(*context.GetEigenDevice()) = t.constant(static_cast(num)); +} + +template +void Transpose::operator()( + const platform::DeviceContext& context, const framework::Tensor& in, + framework::Tensor* out, const std::vector& axis) { + Eigen::array permute; + for (int i = 0; i < Rank; i++) { + permute[i] = axis[i]; + } + auto in_dim = in.dims(); + auto out_dim = out->dims(); + + auto eigen_in = framework::EigenTensor::From(in); + auto eigen_out = framework::EigenTensor::From(*out); + auto* dev = context.GetEigenDevice(); + eigen_out.device(*dev) = eigen_in.shuffle(permute); +} + +template +void RowwiseAdd::operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, + framework::Tensor* output) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector.numel(), size); + PADDLE_ENFORCE_EQ(output->dims(), in_dims); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenMatrix::From(vector); + auto out = framework::EigenMatrix::From(*output); + Eigen::array shape({{1, static_cast(size)}}); + Eigen::array bcast({{static_cast(in_dims[0]), 1}}); + out.device(*context.GetEigenDevice()) = + in + vec.reshape(shape).broadcast(bcast); +} + +template +void ColwiseSum::operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector->numel(), size); + + auto vec = framework::EigenMatrix::From(*vector); + auto in = framework::EigenMatrix::From(input); + Eigen::array shape({{1, static_cast(size)}}); + vec.reshape(shape).device(*context.GetEigenDevice()) = + in.sum(Eigen::array({{0}})).reshape(shape); +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu index 8d04653832d58aa048f73e53b8349a08da3145a4..c5d968aeb216bbb3e0e17f138b9e891494d99f75 100644 --- a/paddle/operators/math/sequence2batch.cu +++ b/paddle/operators/math/sequence2batch.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#define EIGEN_USE_GPU #include "paddle/operators/math/sequence2batch.h" namespace paddle { diff --git a/paddle/operators/math/sequence2batch.h b/paddle/operators/math/sequence2batch.h index 794c7d43973924d470124baf8c0c3de66e4ba087..73295ddbcb73fe80be08e732790f0ec75e94b415 100644 --- a/paddle/operators/math/sequence2batch.h +++ b/paddle/operators/math/sequence2batch.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/framework/eigen.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/tensor.h" #include "paddle/platform/device_context.h" @@ -21,6 +22,10 @@ namespace paddle { namespace operators { namespace math { +template +using EigenMatrix = framework::EigenMatrix; + template class CopyMatrixRowsFunctor { public: diff --git a/paddle/operators/math/softmax.cc b/paddle/operators/math/softmax.cc index 0ba8197ab8b64649c8adcf67771ba01eca7f1d10..3e2f15d6c27f58818128f32fab0bd4c5f36b0050 100644 --- a/paddle/operators/math/softmax.cc +++ b/paddle/operators/math/softmax.cc @@ -13,13 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/math/softmax.h" +#include "paddle/operators/math/softmax_impl.h" namespace paddle { namespace operators { namespace math { template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/softmax.cu b/paddle/operators/math/softmax.cu index 99f988d51e4b16c3f3bfd9c76b411bb53619603e..4dbab51d46bdaaa506a6c242d0958c73687f4eb9 100644 --- a/paddle/operators/math/softmax.cu +++ b/paddle/operators/math/softmax.cu @@ -15,13 +15,16 @@ limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/math/softmax.h" +#include "paddle/operators/math/softmax_impl.h" namespace paddle { namespace operators { namespace math { template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/softmax.h b/paddle/operators/math/softmax.h index b7f627eee7f8fe68a83595a3390a55d438c97afb..fe1074650234c5beb5889e7efd713164769ad740 100644 --- a/paddle/operators/math/softmax.h +++ b/paddle/operators/math/softmax.h @@ -13,60 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" #include "paddle/framework/tensor.h" namespace paddle { namespace operators { namespace math { -template -using EigenMatrix = framework::EigenMatrix; - -template -struct ValueClip { - HOSTDEVICE T operator()(const T& x) const { - const T kThreshold = -64.; - return x < kThreshold ? kThreshold : x; - } -}; - template class SoftmaxFunctor { public: void operator()(const platform::DeviceContext& context, - const framework::Tensor* X, framework::Tensor* Y) { - auto logits = EigenMatrix::From(*X); - auto softmax = EigenMatrix::From(*Y); - - const int kBatchDim = 0; - const int kClassDim = 1; - - const int batch_size = logits.dimension(kBatchDim); - const int num_classes = logits.dimension(kClassDim); - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - - auto shifted_logits = (logits - - logits.maximum(along_class) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)) - .unaryExpr(ValueClip()); - - softmax.device(*context.GetEigenDevice()) = shifted_logits.exp(); - softmax.device(*context.GetEigenDevice()) = - (softmax * - softmax.sum(along_class) - .inverse() - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); - } + const framework::Tensor* X, framework::Tensor* Y); }; template @@ -74,29 +31,7 @@ class SoftmaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor* y, const framework::Tensor* y_grad, - framework::Tensor* x_grad) { - auto softmax = EigenMatrix::From(*y); - auto softmax_grad = EigenMatrix::From(*y_grad); - auto logits_grad = EigenMatrix::From(*x_grad); - - const int kBatchDim = 0; - const int kClassDim = 1; - - const int batch_size = softmax.dimension(kBatchDim); - const int num_classes = softmax.dimension(kClassDim); - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - - auto dot = (softmax * softmax_grad) - .sum(along_class) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class); - logits_grad.device(*context.GetEigenDevice()) = - (softmax_grad - dot) * softmax; - } + framework::Tensor* x_grad); }; } // namespace math diff --git a/paddle/operators/math/softmax_impl.h b/paddle/operators/math/softmax_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..05793eeb3eeafaf36c301236197555b7b35e5803 --- /dev/null +++ b/paddle/operators/math/softmax_impl.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +using EigenMatrix = framework::EigenMatrix; + +template +struct ValueClip { + HOSTDEVICE T operator()(const T& x) const { + const T kThreshold = -64.; + return x < kThreshold ? kThreshold : x; + } +}; + +template +void SoftmaxFunctor::operator()( + const platform::DeviceContext& context, const framework::Tensor* X, + framework::Tensor* Y) { + auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + + auto shifted_logits = (logits - + logits.maximum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + + softmax.device(*context.GetEigenDevice()) = shifted_logits.exp(); + softmax.device(*context.GetEigenDevice()) = + (softmax * + softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); +} + +template +void SoftmaxGradFunctor::operator()( + const platform::DeviceContext& context, const framework::Tensor* y, + const framework::Tensor* y_grad, framework::Tensor* x_grad) { + auto softmax = EigenMatrix::From(*y); + auto softmax_grad = EigenMatrix::From(*y_grad); + auto logits_grad = EigenMatrix::From(*x_grad); + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = softmax.dimension(kBatchDim); + const int num_classes = softmax.dimension(kClassDim); + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + + auto dot = (softmax * softmax_grad) + .sum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class); + logits_grad.device(*context.GetEigenDevice()) = + (softmax_grad - dot) * softmax; +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/matmul_op.cu b/paddle/operators/matmul_op.cu.cc similarity index 100% rename from paddle/operators/matmul_op.cu rename to paddle/operators/matmul_op.cu.cc diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h index 4f565946d596b5e5fbf90f16c0c13c780c36886c..1e4aa48b7018d8e3d6f02591fbca2877ddbd3c5d 100644 --- a/paddle/operators/matmul_op.h +++ b/paddle/operators/matmul_op.h @@ -15,8 +15,8 @@ #pragma once #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/matmul.h" -#include "paddle/operators/transpose_op.h" namespace paddle { namespace operators { @@ -76,7 +76,10 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context, if (in_dims.size() == 3) { output.Resize({in_dims[1], in_dims[0], in_dims[2]}); output.mutable_data(context.GetPlace()); - EigenTranspose(context, input, output, {1, 0, 2}); + std::vector axis = {1, 0, 2}; + math::Transpose trans; + trans(context.device_context(), input, &output, axis); + std::vector out_dims = {in_dims[1], in_dims[0] * in_dims[2]}; output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); } else { output.ShareDataWith(input); diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu.cc similarity index 100% rename from paddle/operators/mul_op.cu rename to paddle/operators/mul_op.cu.cc diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu.cc similarity index 100% rename from paddle/operators/nccl_op.cu rename to paddle/operators/nccl_op.cu.cc diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu.cc similarity index 100% rename from paddle/operators/nccl_op_test.cu rename to paddle/operators/nccl_op_test.cu.cc diff --git a/paddle/operators/pool_cudnn_op.cu b/paddle/operators/pool_cudnn_op.cu.cc similarity index 100% rename from paddle/operators/pool_cudnn_op.cu rename to paddle/operators/pool_cudnn_op.cu.cc diff --git a/paddle/operators/pool_op.cu b/paddle/operators/pool_op.cu.cc similarity index 100% rename from paddle/operators/pool_op.cu rename to paddle/operators/pool_op.cu.cc diff --git a/paddle/operators/pool_with_index_op.cu b/paddle/operators/pool_with_index_op.cu.cc similarity index 100% rename from paddle/operators/pool_with_index_op.cu rename to paddle/operators/pool_with_index_op.cu.cc diff --git a/paddle/operators/pool_with_index_op.h b/paddle/operators/pool_with_index_op.h index c0e3b117dc3ea351b9edfed4d1823de0db27d30a..a081607edce335f0265388ab01238d584bcf3ead 100644 --- a/paddle/operators/pool_with_index_op.h +++ b/paddle/operators/pool_with_index_op.h @@ -81,22 +81,21 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); - auto temp = framework::EigenVector::Flatten(*in_x_grad); - temp.device(context.GetEigenDevice()) = - temp.constant(static_cast(0)); + auto& device_ctx = context.device_context(); + math::set_constant(device_ctx, in_x_grad, 0); switch (ksize.size()) { case 2: { paddle::operators::math::MaxPool2dWithIndexGradFunctor pool2d_backward; - pool2d_backward(context.device_context(), *out_grad, *mask, ksize, - strides, paddings, in_x_grad); + pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, + paddings, in_x_grad); } break; case 3: { paddle::operators::math::MaxPool3dWithIndexGradFunctor pool3d_backward; - pool3d_backward(context.device_context(), *out_grad, *mask, ksize, - strides, paddings, in_x_grad); + pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, + paddings, in_x_grad); } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } diff --git a/paddle/operators/reshape_op.cu b/paddle/operators/reshape_op.cu.cc similarity index 100% rename from paddle/operators/reshape_op.cu rename to paddle/operators/reshape_op.cu.cc diff --git a/paddle/operators/sequence_concat_op.cu b/paddle/operators/sequence_concat_op.cu.cc similarity index 100% rename from paddle/operators/sequence_concat_op.cu rename to paddle/operators/sequence_concat_op.cu.cc diff --git a/paddle/operators/sequence_conv_op.cu b/paddle/operators/sequence_conv_op.cu.cc similarity index 97% rename from paddle/operators/sequence_conv_op.cu rename to paddle/operators/sequence_conv_op.cu.cc index 4c0c673a517c4b05c3abd8bf6b5cf5bbb19cfae0..6106b0e46c0ab96e01dfc344055f23dbf4a1a2c3 100644 --- a/paddle/operators/sequence_conv_op.cu +++ b/paddle/operators/sequence_conv_op.cu.cc @@ -12,8 +12,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU - #include "paddle/operators/sequence_conv_op.h" namespace ops = paddle::operators; diff --git a/paddle/operators/sequence_conv_op.h b/paddle/operators/sequence_conv_op.h index adee8d760e1d6bed852236ecee0951656c458901..b8fbe2647c4338a2fa16aa655ebab64dd8d5417d 100644 --- a/paddle/operators/sequence_conv_op.h +++ b/paddle/operators/sequence_conv_op.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" #include "paddle/operators/math/context_project.h" #include "paddle/operators/math/math_function.h" diff --git a/paddle/operators/sequence_softmax_op.cu b/paddle/operators/sequence_softmax_op.cu.cc similarity index 100% rename from paddle/operators/sequence_softmax_op.cu rename to paddle/operators/sequence_softmax_op.cu.cc diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index 2f41c7fc121950926f6e8d842eb629d59738f321..7b6c5ec30628b521b594ceaa3b7f1e0e03e497e4 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -20,11 +20,11 @@ namespace paddle { namespace operators { namespace { -template +template __global__ void SparseSGDFunctorKernel(const T* selected_rows, const int64_t* rows, const T* learning_rate, T* tensor_out, - int64_t row_numel, int block_size) { + int64_t row_numel) { const int ty = blockIdx.y; int tid = threadIdx.x; @@ -59,14 +59,15 @@ struct SparseSGDFunctor { auto* in_data = in_value.data(); auto* out_data = output->data(); - int block_size = 256; + const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in_rows.size()); SparseSGDFunctorKernel< - T><<(context) - .stream()>>>(in_data, in_rows.data(), learning_rate.data(), - out_data, in_row_numel, block_size); + T, 256><<(context) + .stream()>>>(in_data, in_rows.data(), + learning_rate.data(), out_data, + in_row_numel); } }; diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu.cc similarity index 100% rename from paddle/operators/softmax_op.cu rename to paddle/operators/softmax_op.cu.cc diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc index ed96e8cee5a78e63ea29ed383d06c1258abdc328..3dbb62d2e571eb92025c1b3fc0a6653c7cda007a 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/operators/softmax_with_cross_entropy_op.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/operators/softmax_with_cross_entropy_op.h" #include -#include namespace paddle { namespace operators { diff --git a/paddle/operators/split_op.cu b/paddle/operators/split_op.cu.cc similarity index 100% rename from paddle/operators/split_op.cu rename to paddle/operators/split_op.cu.cc diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index 57b99bdb3a9359bbfdbe62a6fc9afca6c4d5df9e..c2b7632b2865a3ef66051d815d7722a08c6a8cbd 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -12,7 +12,7 @@ limitations under the License. */ #include "paddle/operators/sum_op.h" #include #include "paddle/framework/var_type_inference.h" -#include "paddle/operators/net_op.h" +#include "paddle/operators/detail/safe_ref.h" namespace paddle { namespace operators { @@ -60,13 +60,16 @@ class SumOp : public framework::OperatorWithKernel { x_vars[0]->Get().value().type()), ctx.device_context()); } else if (x_vars[0]->IsType()) { - auto& array = x_vars[0]->Get(); - for (auto& each : array) { - if (each.numel() != 0) { - return framework::OpKernelType(framework::ToDataType(each.type()), - ctx.device_context()); + for (auto& x_var : x_vars) { + auto& array = x_var->Get(); + for (auto& each : array) { + if (each.numel() != 0) { + return framework::OpKernelType(framework::ToDataType(each.type()), + ctx.device_context()); + } } } + PADDLE_THROW("Cannot find the input data type by all input data"); } PADDLE_THROW("Unexpected branch. Input type is %s", x_vars[0]->Type().name()); @@ -97,6 +100,11 @@ class SumOpVarTypeInference : public framework::VarTypeInference { auto& inputs = op_desc.Input("X"); auto var_type = framework::VarDesc::SELECTED_ROWS; + for (auto& name : op_desc.Input("X")) { + VLOG(10) << name << " " + << block->FindRecursiveOrCreateVar(name)->GetType(); + } + bool any_input_is_lod_tensor = std::any_of( inputs.begin(), inputs.end(), [block](const std::string& name) { return block->FindRecursiveOrCreateVar(name)->GetType() == @@ -104,7 +112,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference { }); auto is_tensor_array = [block](const std::string& name) { - return block->FindRecursiveOrCreateVar(name)->GetType() == + return detail::Ref(block->FindRecursiveOrCreateVar(name)).GetType() == framework::VarDesc::LOD_TENSOR_ARRAY; }; @@ -114,14 +122,26 @@ class SumOpVarTypeInference : public framework::VarTypeInference { std::all_of(inputs.begin(), inputs.end(), is_tensor_array); if (any_input_is_tensor_array) { - PADDLE_ENFORCE(all_inputs_are_tensor_array); + if (!all_inputs_are_tensor_array) { + std::ostringstream os; + for (auto& each : inputs) { + os << " " << each << " type is " + << detail::Ref(block->FindRecursiveOrCreateVar(each)).GetType() + << "\n"; + } + PADDLE_ENFORCE(all_inputs_are_tensor_array, + "Not all inputs are tensor array:\n%s", os.str()); + } var_type = framework::VarDesc::LOD_TENSOR_ARRAY; } else if (any_input_is_lod_tensor) { var_type = framework::VarDesc::LOD_TENSOR; } auto out_var_name = op_desc.Output("Out").front(); - block->FindRecursiveOrCreateVar(out_var_name)->SetType(var_type); + auto& out_var = detail::Ref(block->FindRecursiveOrCreateVar(out_var_name)); + out_var.SetType(var_type); + auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front())); + out_var.SetDataType(in_var.GetDataType()); } }; diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index 62e15604c47f25c458abc69ecd1cabf964de39bb..ae1b48d7a8e3d573a5134a822a2ed5ef70511077 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/array_operator.h" - +#include "paddle/operators/detail/safe_ref.h" namespace paddle { namespace operators { @@ -33,6 +33,8 @@ class WriteToArrayOp : public ArrayOp { auto *out = scope.FindVar(Output("Out"))->GetMutable(); if (offset >= out->size()) { + VLOG(10) << "Resize " << Output("Out") << " from " << out->size() + << " to " << offset + 1; out->resize(offset + 1); } auto *out_tensor = &out->at(offset); @@ -85,11 +87,15 @@ class WriteToArrayInferVarType : public framework::VarTypeInference { public: void operator()(const framework::OpDescBind &op_desc, framework::BlockDescBind *block) const override { - for (auto &out_var : op_desc.OutputArgumentNames()) { - VLOG(10) << "Set Variable " << out_var << " as LOD_TENSOR_ARRAY"; - block->FindRecursiveOrCreateVar(out_var)->SetType( - framework::VarDesc::LOD_TENSOR_ARRAY); - } + auto x_name = op_desc.Input("X")[0]; + auto out_name = op_desc.Output("Out")[0]; + VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; + auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name), + "Cannot found %s", out_name); + out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY); + auto &x = + detail::Ref(block->FindVarRecursive(x_name), "Cannot found %s", x_name); + out.SetDataType(x.GetDataType()); } }; @@ -107,11 +113,11 @@ class ReadFromArrayOp : public ArrayOp { auto &x_array = x->Get(); auto *out = scope.FindVar(Output("Out")); PADDLE_ENFORCE(out != nullptr, "Out must be set"); - auto *out_tesnor = out->GetMutable(); + auto *out_tensor = out->GetMutable(); size_t offset = GetOffset(scope, dev_ctx); PADDLE_ENFORCE_LT(offset, x_array.size()); - out_tesnor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx); - out_tesnor->set_lod(x_array[offset].lod()); + out_tensor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx); + out_tensor->set_lod(x_array[offset].lod()); } }; diff --git a/paddle/operators/transpose_op.cu b/paddle/operators/transpose_op.cu.cc similarity index 100% rename from paddle/operators/transpose_op.cu rename to paddle/operators/transpose_op.cu.cc diff --git a/paddle/operators/transpose_op.h b/paddle/operators/transpose_op.h index aaa3f47ab5545accd4d1108e0ad6f5a3062186d0..e296032f4147f9f8338148f9e4fef100c7cf816f 100644 --- a/paddle/operators/transpose_op.h +++ b/paddle/operators/transpose_op.h @@ -14,27 +14,44 @@ #pragma once -#include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" namespace paddle { namespace operators { -template -void EigenTranspose(const framework::ExecutionContext& context, - const framework::Tensor& in, framework::Tensor& out, - std::vector axis) { - Eigen::array permute; - for (int i = 0; i < Rank; i++) { - permute[i] = axis[i]; +template +inline void TransCompute(const int dim, const platform::DeviceContext& dev_ctx, + const framework::Tensor& in, framework::Tensor* out, + const std::vector& axis) { + switch (dim) { + case 1: + math::Transpose trans1; + trans1(dev_ctx, in, out, axis); + break; + case 2: + math::Transpose trans2; + trans2(dev_ctx, in, out, axis); + break; + case 3: + math::Transpose trans3; + trans3(dev_ctx, in, out, axis); + break; + case 4: + math::Transpose trans4; + trans4(dev_ctx, in, out, axis); + break; + case 5: + math::Transpose trans5; + trans5(dev_ctx, in, out, axis); + break; + case 6: + math::Transpose trans6; + trans6(dev_ctx, in, out, axis); + break; + default: + PADDLE_THROW("Tensors with rank at most 6 are supported"); } - auto in_dim = in.dims(); - auto out_dim = out.dims(); - - auto eigen_in = framework::EigenTensor::From(in); - auto eigen_out = framework::EigenTensor::From(out); - auto& dev = context.GetEigenDevice(); - eigen_out.device(dev) = eigen_in.shuffle(permute); } template @@ -47,28 +64,8 @@ class TransposeKernel : public framework::OpKernel { std::vector axis = context.Attr>("axis"); int ndims = axis.size(); - switch (ndims) { - case 1: - EigenTranspose(context, *x, *out, axis); - break; - case 2: - EigenTranspose(context, *x, *out, axis); - break; - case 3: - EigenTranspose(context, *x, *out, axis); - break; - case 4: - EigenTranspose(context, *x, *out, axis); - break; - case 5: - EigenTranspose(context, *x, *out, axis); - break; - case 6: - EigenTranspose(context, *x, *out, axis); - break; - default: - PADDLE_THROW("Tensors with rank at most 6 are supported"); - } + auto& dev_ctx = context.device_context(); + TransCompute(ndims, dev_ctx, *x, out, axis); } }; @@ -80,47 +77,19 @@ class TransposeGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out")); auto* x_grad = context.Output(framework::GradVarName("X")); - if (x_grad) { - x_grad->mutable_data(context.GetPlace()); - - std::vector axis = context.Attr>("axis"); - std::vector reversed_axis(axis); + if (!x_grad) return; - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - - int ndims = axis.size(); + x_grad->mutable_data(context.GetPlace()); + std::vector axis = context.Attr>("axis"); + std::vector reversed_axis(axis); - switch (ndims) { - case 1: - EigenTranspose(context, *out_grad, *x_grad, - reversed_axis); - break; - case 2: - EigenTranspose(context, *out_grad, *x_grad, - reversed_axis); - break; - case 3: - EigenTranspose(context, *out_grad, *x_grad, - reversed_axis); - break; - case 4: - EigenTranspose(context, *out_grad, *x_grad, - reversed_axis); - break; - case 5: - EigenTranspose(context, *out_grad, *x_grad, - reversed_axis); - break; - case 6: - EigenTranspose(context, *out_grad, *x_grad, - reversed_axis); - break; - default: - PADDLE_THROW("Tensors with rank at most 6 are supported"); - } + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; } + + int ndims = axis.size(); + auto& dev_ctx = context.device_context(); + TransCompute(ndims, dev_ctx, *out_grad, x_grad, reversed_axis); } }; diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 4ca6c8507a48507fd29a9c9acae2bdf36ed936ee..dcc59f5ff2ae3a8ca999d72a20cfd5c759987d89 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -14,8 +14,10 @@ #include #include "paddle/framework/executor.h" +#include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" +#include "paddle/operators/detail/safe_ref.h" namespace paddle { namespace operators { @@ -26,8 +28,9 @@ using LoDTensor = framework::LoDTensor; constexpr char kStepBlock[] = "step_block"; constexpr char kCondition[] = "Condition"; constexpr char kStepScopes[] = "StepScopes"; -constexpr char kParamGrads[] = "X@Grad"; constexpr char kParameters[] = "X"; +constexpr char kParamGrads[] = "X@GRAD"; +constexpr char kOutputs[] = "Out"; class WhileOp : public framework::OperatorBase { public: @@ -71,9 +74,9 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { kCondition, "(Bool) An scalar. When it's False, the While Op will be terminated.") .AsDuplicable(); - AddOutput("Out", + AddOutput(kOutputs, "A set of variables, which will be assigned with values " - "generated by perators inside the block of While Op.") + "generated by the operators inside the block of While Op.") .AsDuplicable(); AddOutput(kStepScopes, "(StepScopeVar) A vector of local scope, which size equals the " @@ -104,17 +107,64 @@ class WhileGradOp : public framework::OperatorBase { auto *step_scopes = scope.FindVar(Input(kStepScopes))->GetMutable(); + auto outside_og_names = Inputs(framework::GradVarName(kOutputs)); + auto inside_og_names = + Attr>("original_output_grad"); + + PADDLE_ENFORCE_EQ(outside_og_names.size(), inside_og_names.size()); + for (auto cur_scope_iter = step_scopes->rbegin(); cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) { + VLOG(3) << "Start backward at time_step " + << cur_scope_iter - step_scopes->rbegin(); + framework::Scope &cur_scope = **cur_scope_iter; + // Link OG from outside to inside + for (size_t i = 0; i < outside_og_names.size(); ++i) { + auto outside_og_name = outside_og_names[i]; + auto inside_og_name = inside_og_names[i]; + VLOG(10) << "Linking outside " << outside_og_name << " --> inside " + << inside_og_name; + auto &og_outside = detail::Ref(scope.FindVar(outside_og_name)); + auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name)); + if (og_outside.Type().hash_code() == + typeid(framework::LoDTensor).hash_code()) { + auto &outside_tensor = og_outside.Get(); + auto &inside_tensor = + detail::Ref(og_inside.GetMutable()); + inside_tensor.set_lod(outside_tensor.lod()); + inside_tensor.ShareDataWith(outside_tensor); + } else if (og_outside.Type().hash_code() == + typeid(framework::LoDTensorArray).hash_code()) { + auto &outside_array = og_outside.Get(); + auto &inside_array = + detail::Ref(og_inside.GetMutable()); + VLOG(10) << outside_og_name << " size = " << outside_array.size(); + inside_array.resize(outside_array.size()); + + for (size_t j = 0; j < inside_array.size(); ++j) { + VLOG(10) << j << " " << outside_array[j].numel(); + if (outside_array[j].numel() != 0) { + inside_array[j].set_lod(outside_array[j].lod()); + inside_array[j].ShareDataWith(outside_array[j]); + } else { + PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0); + } + } + } + } + executor.Run(*program, *cur_scope_iter, block->ID(), false); auto &pg_names = Outputs(kParamGrads); auto &p_names = Inputs(kParameters); PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); - for (size_t prog_id = 0; prog_id < pg_names.size(); ++prog_id) { - auto inside_grad_name = framework::GradVarName(p_names[prog_id]); + for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) { + if (pg_names[param_id] == framework::kEmptyVarName) { + continue; // iterator doesn't have gradient + } + auto inside_grad_name = framework::GradVarName(p_names[param_id]); - // // TODO(tonyyang-savil: Not sure we need the following + // // TODO(tonyyang-svail): Not sure we need the following // // If does not compute gradient of that variable inside rnn, // just // // continue @@ -126,7 +176,7 @@ class WhileGradOp : public framework::OperatorBase { // zero gradient variable in step 0 if (cur_scope_iter == step_scopes->rbegin()) { auto *var = (*cur_scope_iter)->FindVar(inside_grad_name); - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name); if (var->IsType()) { auto &inside_tensor = var->Get(); framework::AttributeMap attrs; @@ -135,27 +185,18 @@ class WhileGradOp : public framework::OperatorBase { attrs["value"] = 0.0f; auto zero_op = framework::OpRegistry::CreateOp( - "fill_constant", {}, {{"Out", {pg_names[prog_id]}}}, attrs); + "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs); zero_op->Run(scope, dev_ctx); } } // sum gradient - auto *outside_var = scope.FindVar(pg_names[prog_id]); - PADDLE_ENFORCE_NOT_NULL(outside_var); - auto &outside_tensor = *outside_var->GetMutable(); - - std::string result_var_name; - auto *local_result_var = (*cur_scope_iter)->Var(&result_var_name); - auto &local_result_tensor = - *local_result_var->GetMutable(); - - local_result_tensor.ShareDataWith(outside_tensor); - + auto new_inside_name = cur_scope.Rename(inside_grad_name); auto sum_op = framework::OpRegistry::CreateOp( - "sum", {{"X", {result_var_name, inside_grad_name}}}, - {{"Out", {result_var_name}}}, {}); - sum_op->Run(**cur_scope_iter, dev_ctx); + "sum", {{"X", {pg_names[param_id], new_inside_name}}}, + {{"Out", {pg_names[param_id]}}}, {}); + sum_op->Run(cur_scope, dev_ctx); + cur_scope.Rename(new_inside_name, inside_grad_name); } } } @@ -169,29 +210,110 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { virtual std::unique_ptr Apply() const { auto *grad = new framework::OpDescBind(); grad->SetType("while_grad"); - for (auto &input_param : this->InputNames()) { - grad->SetInput(input_param, this->Input(input_param)); - grad->SetOutput(framework::GradVarName(input_param), - this->InputGrad(input_param)); + grad->SetInput(kParameters, Input(kParameters)); + grad->SetOutput( + framework::GradVarName(kParameters), + InputGrad(kParameters, /*do not drop empty gradient*/ false)); + grad->SetInput(kOutputs, Output(kOutputs)); + + // OG should be re-calculated by step blocks, since many outputs of while op + // do not need to calculate gradients. + std::unordered_set block_ins; + { + for (auto &p : Input(kParameters)) { + block_ins.insert(p); + } + for (auto &o : Output(kOutputs)) { + block_ins.insert(o); + } } + std::unordered_set extra_inputs; + for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) { + for (auto &input_name : grad_block_[0]->Op(i)->InputArgumentNames()) { + if (block_ins.find(input_name) != block_ins.end()) { + continue; + } + extra_inputs.insert(input_name); + } - for (auto &output_param : this->OutputNames()) { - grad->SetInput(output_param, this->Output(output_param)); - if (output_param != kStepScopes) { - grad->SetInput(framework::GradVarName(output_param), - this->OutputGrad(output_param)); + for (auto &output_name : grad_block_[0]->Op(i)->OutputArgumentNames()) { + block_ins.insert(output_name); } } + + std::vector extra_inputs_list; + extra_inputs_list.resize(extra_inputs.size()); + std::copy(extra_inputs.begin(), extra_inputs.end(), + extra_inputs_list.begin()); + grad->SetInput(framework::GradVarName(kOutputs), extra_inputs_list); + grad->SetInput(kStepScopes, Output(kStepScopes)); grad->SetAttrMap(this->Attrs()); grad->SetBlockAttr(kStepBlock, *grad_block_[0]); + // record the original output gradient names, since the gradient name of + // while operator could be renamed. + grad->SetAttr("original_output_grad", extra_inputs_list); return std::unique_ptr(grad); } }; +class WhileGradOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDescBind &op_desc, + framework::BlockDescBind *block) const override { + auto p_names = op_desc.Input(kParameters); + auto pg_names = op_desc.Output(framework::GradVarName(kParameters)); + + for (size_t i = 0; i < p_names.size(); ++i) { + auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i])); + auto *g_var = block->FindVarRecursive(pg_names[i]); + if (g_var != nullptr) { // Gradient could be @EMPTY@ + VLOG(5) << "Setting " << pg_names[i] << " following " << p_names[i] + << " type: " << p_var.GetType(); + g_var->SetType(p_var.GetType()); + g_var->SetDataType(p_var.GetDataType()); + } + } + } +}; + +class WhileGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + ctx->HasInputs(kParameters); + ctx->HasOutputs(framework::GradVarName(kParameters)); + ctx->HasInputs(kOutputs); + ctx->HasInputs(framework::GradVarName(kOutputs)); + + auto p_names = ctx->Inputs(kParameters); + auto pg_names = ctx->Outputs(kParamGrads); + auto dims = ctx->GetInputsDim(kParameters); + auto var_types = ctx->GetInputsVarType(kParameters); + std::vector names_to_set; + std::vector dims_to_set; + for (size_t i = 0; i < p_names.size(); ++i) { + if (pg_names[i] == framework::kEmptyVarName) { + continue; + } + if (var_types[i] == framework::VarDesc::LOD_TENSOR) { + names_to_set.push_back(pg_names[i]); + dims_to_set.push_back(dims[i]); + } else if (var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) { + // not sure how to set the dim of LOD_TENSOR_ARRAY + names_to_set.push_back(pg_names[i]); + dims_to_set.push_back(dims[i]); + } + } + ctx->SetDims(names_to_set, dims_to_set); + } +}; + } // namespace operators } // namespace paddle REGISTER_OPERATOR(while, paddle::operators::WhileOp, paddle::operators::WhileOpMaker, paddle::operators::WhileGradOpDescMaker); +REGISTER_OPERATOR(while_grad, paddle::operators::WhileGradOp, + paddle::operators::WhileGradOpShapeInference, + paddle::operators::WhileGradOpVarTypeInference); diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp index 8b3be062b654a52e667626199be8c8bb4a2a96d7..1898598e49652a2829e57329bab6017304cec662 100644 --- a/paddle/parameter/ParameterUpdateFunctions.cpp +++ b/paddle/parameter/ParameterUpdateFunctions.cpp @@ -30,7 +30,7 @@ void sgdUpdateCpu(real learningRate, const real* grad, real* momentumVec) { decayRate *= learningRate; -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_USE_MKLML #pragma omp parallel for #endif for (size_t i = 0; i < size; ++i) { diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h index 6b64539b0a9a4d535a53447fbcc0e458f3ac9129..61a22d9db3e07cbe6fbca0e0b09fedcba232ff6c 100644 --- a/paddle/platform/dynload/cublas.h +++ b/paddle/platform/dynload/cublas.h @@ -62,6 +62,8 @@ extern void *cublas_dso_handle; DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasSaxpy_v2); \ + __macro(cublasDaxpy_v2); \ __macro(cublasSgemv_v2); \ __macro(cublasDgemv_v2); \ __macro(cublasSgemm_v2); \ diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md index b5fd68839ddb62e76f2fd930248d546bc093a892..f3a6f1dba7588c6b29c1dcae26ec134c1a7f937d 100644 --- a/paddle/scripts/docker/README.md +++ b/paddle/scripts/docker/README.md @@ -57,8 +57,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" | `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. | | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. | | `WITH_TESTING` | ON | Build unit tests binaries. | -| `WITH_MKLDNN` | ON | Build with [Intel® MKL DNN](https://github.com/01org/mkl-dnn) support. | -| `WITH_MKLML` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) support. | +| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. | | `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. | | `WITH_SWIG_PY` | ON | Build with SWIG python API support. | | `WITH_C_API` | OFF | Build capi libraries for inference. | diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index e9c89eee1af1fcc4a7f168af5ec8b16912616687..595d25fd4830b6e69b9a1080803771b0464741db 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -34,8 +34,7 @@ function cmake_gen() { ${PYTHON_FLAGS} -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU:-OFF} - -DWITH_MKLDNN=${WITH_MKLDNN:-ON} - -DWITH_MKLML=${WITH_MKLML:-ON} + -DWITH_MKL=${WITH_MKL:-ON} -DWITH_AVX=${WITH_AVX:-OFF} -DWITH_GOLANG=${WITH_GOLANG:-ON} -DWITH_SWIG_PY=ON @@ -56,8 +55,7 @@ EOF ${PYTHON_FLAGS} \ -DWITH_DOC=OFF \ -DWITH_GPU=${WITH_GPU:-OFF} \ - -DWITH_MKLDNN=${WITH_MKLDNN:-ON} \ - -DWITH_MKLML=${WITH_MKLML:-ON} \ + -DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_GOLANG=${WITH_GOLANG:-ON} \ -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \ diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index 5c4b5a2495182ea5d2b3341cff650dfb4d8b0c0f..d71cb84df3785008ea5793519fc26a174e1b95f7 100755 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -18,8 +18,8 @@ function version(){ echo "PaddlePaddle @PADDLE_VERSION@, compiled with" echo " with_avx: @WITH_AVX@" echo " with_gpu: @WITH_GPU@" + echo " with_mkl: @WITH_MKL@" echo " with_mkldnn: @WITH_MKLDNN@" - echo " with_mklml: @WITH_MKLML@" echo " with_double: @WITH_DOUBLE@" echo " with_python: @WITH_PYTHON@" echo " with_rdma: @WITH_RDMA@" @@ -43,6 +43,54 @@ function ver2num() { set +e } +function cpu_config() { + # auto set KMP_AFFINITY and OMP_DYNAMIC from Hyper Threading Status + # only when MKL enabled + if [ "@WITH_MKL@" == "OFF" ]; then + return 0 + fi + ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` + if [ $ht -eq 1 ]; then # HT is OFF + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,0,0" + fi + if [ -z "$OMP_DYNAMIC" ]; then + export OMP_DYNAMIC="FALSE" + fi + else # HT is ON + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,1,0" + fi + if [ -z "$OMP_DYNAMIC" ]; then + export OMP_DYNAMIC="True" + fi + fi +} + +function threads_config() { + # auto set OMP_NUM_THREADS and MKL_NUM_THREADS + # according to trainer_count and total processors + # only when MKL enabled + if [ "@WITH_MKL@" == "OFF" ]; then + return 0 + fi + processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` + trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs` + if [ -z $trainers ]; then + trainers=1 + fi + threads=$((processors / trainers)) + if [ $threads -eq 0 ]; then + threads=1 + fi + if [ -z "$OMP_NUM_THREADS" ]; then + export OMP_NUM_THREADS=$threads + fi + if [ -z "$MKL_NUM_THREADS" ]; then + export MKL_NUM_THREADS=$threads + fi +} + PADDLE_CONF_HOME="$HOME/.config/paddle" mkdir -p ${PADDLE_CONF_HOME} @@ -92,9 +140,13 @@ else: sys.exit(0) EOF +cpu_config +# echo $KMP_AFFINITY $OMP_DYNAMIC case "$1" in "train") + threads_config $@ + # echo $OMP_NUM_THREADS $MKL_NUM_THREADS ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2} ;; "merge_model") diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index 973b2736e5ce2b733d52df4f5a270b296bca2cac..28d82343ed32273740d0c52d0451681e43b3675e 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -6,7 +6,7 @@ mkdir -p $TRAVIS_BUILD_DIR/build cd $TRAVIS_BUILD_DIR/build # Compile Documentation only. -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON +cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON make -j `nproc` gen_proto_py make -j `nproc` paddle_docs paddle_docs_cn diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp index b68e29cd5ea223272151e7a8b52d998832f47103..88e684849df6fbfe4042b92bdb76ef98159eecea 100644 --- a/paddle/trainer/Trainer.cpp +++ b/paddle/trainer/Trainer.cpp @@ -137,6 +137,10 @@ void Trainer::init(const std::shared_ptr& config, } } + if (FLAGS_use_mkldnn) { + CHECK_EQ(FLAGS_trainer_count, 1UL) << "MKLDNN only need 1 trainer"; + } + if (testing) { LOG(INFO) << "trainer: in testing mode"; if (config_->getOptConfig().use_sparse_remote_updater() || diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt index f01ad4142d4fe7c7f7d7aac60d967ea114b93e56..80665551ec51214d90b866f0c7b2abb2fdee5f39 100644 --- a/paddle/trainer/tests/CMakeLists.txt +++ b/paddle/trainer/tests/CMakeLists.txt @@ -28,35 +28,7 @@ if(WITH_PYTHON) ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) endif() -################ test_CompareTwoNets ###################### -add_unittest_without_exec(test_CompareTwoNets - test_CompareTwoNets.cpp) -add_test(NAME test_CompareTwoNets - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets - --config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) -############### test_CompareTwoOpts ################### -add_unittest_without_exec(test_CompareTwoOpts - test_CompareTwoOpts.cpp) -add_test(NAME test_CompareTwoOpts - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoOpts - --config_file_a=trainer/tests/sample_trainer_config_opt_a.conf --config_file_b=trainer/tests/sample_trainer_config_opt_b.conf - --num_passes=1 --need_high_accuracy=0 - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) - -################# test_CompareSparse ################## -add_unittest_without_exec(test_CompareSparse - test_CompareSparse.cpp) -if(NOT ON_TRAVIS) - add_test(NAME test_CompareSparse - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ - ./.set_port.sh -p port -n 6 - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) -endif() ################# test_recurrent_machine_generation ############### add_unittest_without_exec(test_recurrent_machine_generation test_recurrent_machine_generation.cpp) diff --git a/paddle/trainer/tests/mnist.list b/paddle/trainer/tests/mnist.list deleted file mode 100644 index 703e87753d5a4f507aad11a6d875cea44787667b..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/mnist.list +++ /dev/null @@ -1 +0,0 @@ -trainer/tests/mnist_bin_part diff --git a/paddle/trainer/tests/mnist_bin_part b/paddle/trainer/tests/mnist_bin_part deleted file mode 100644 index 08b93a0ebb5698bdafbc36c3c757918a50bab621..0000000000000000000000000000000000000000 Binary files a/paddle/trainer/tests/mnist_bin_part and /dev/null differ diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data deleted file mode 100644 index f189b21e86a50d70d317b5e43aa2d6e05af5e774..0000000000000000000000000000000000000000 Binary files a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data and /dev/null differ diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist deleted file mode 100644 index 6b406dff0ba91b5f310d7eafa111c0d21d6542c3..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist +++ /dev/null @@ -1 +0,0 @@ -./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data diff --git a/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf b/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf deleted file mode 100644 index 92f32a18c0068ab4672034a270aa8c52f2716d59..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf +++ /dev/null @@ -1,154 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. - -# Note: when making change to this file, please make sure -# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest -# for comparing these two nets can pass (test_CompareTwoNets) - -default_initial_std(0.1) -default_device(0) - -word_dim = 999 -l1 = 0 -l2 = 0 - -model_type("nn") - -sparse_update = get_config_arg("sparse_update", bool, False) - -TrainData(ProtoData( - type = "proto_sequence", - files = ('trainer/tests/train_sparse.list'), - )) - -Settings( - algorithm='sgd', - batch_size=100, - learning_rate=0.0001, - learning_rate_decay_a=4e-08, - learning_rate_decay_b=0.0, - learning_rate_schedule='poly', -) - - -wordvec_dim = 32 -layer2_dim = 16 -layer3_dim = 16 -hidden_dim = 32 - -slot_names = ["qb", "qw", "tb", "tw"] - -def ltr_network(network_name, - word_dim=word_dim, - wordvec_dim=wordvec_dim, - layer2_dim=layer2_dim, - layer3_dim=layer3_dim, - hidden_dim=hidden_dim, - slot_names=slot_names, - l1=l1, - l2=l2): - - slotnum = len(slot_names) - for i in xrange(slotnum): - Inputs(slot_names[i] + network_name) - for i in xrange(slotnum): - Layer( - name = slot_names[i] + network_name, - type = "data", - size = word_dim, - device = -1, - ) - Layer( - name = slot_names[i] + "_embedding_" + network_name, - type = "mixed", - size = wordvec_dim, - bias = False, - device = -1, - inputs = TableProjection(slot_names[i] + network_name, - parameter_name = "embedding.w0", - decay_rate_l1=l1, - sparse_remote_update = True, - sparse_update = sparse_update, - ), - ) - Layer( - name = slot_names[i] + "_rnn1_" + network_name, - type = "recurrent", - active_type = "tanh", - bias = Bias(initial_std = 0, - parameter_name = "rnn1.bias"), - inputs = Input(slot_names[i] + "_embedding_" + network_name, - parameter_name = "rnn1.w0") - ) - Layer( - name = slot_names[i] + "_rnnlast_" + network_name, - type = "seqlastins", - inputs = [ - slot_names[i] + "_rnn1_" + network_name, - ], - ) - - Layer( - name = "layer2_" + network_name, - type = "fc", - active_type = "tanh", - size = layer2_dim, - bias = Bias(parameter_name = "layer2.bias"), - inputs = [Input(slot_name + "_rnnlast_" + network_name, - parameter_name = "_layer2_" + slot_name + ".w", - decay_rate = l2, - initial_smart = True) for slot_name in slot_names] - ) - Layer( - name = "layer3_" + network_name, - type = "fc", - active_type = "tanh", - size = layer3_dim, - bias = Bias(parameter_name = "layer3.bias"), - inputs = [ - Input("layer2_" + network_name, - parameter_name = "_layer3.w", - decay_rate = l2, - initial_smart = True), - ] - ) - Layer( - name = "output_" + network_name, - type = "fc", - size = 1, - bias = False, - inputs = [ - Input("layer3_" + network_name, - parameter_name = "_layerO.w"), - ], - ) - - -ltr_network("left") -ltr_network("right") -Inputs("label") -Layer( - name = "label", - type = "data", - size = 1, - ) -Outputs("cost", "qb_rnnlast_left") -Layer( - name = "cost", - type = "rank-cost", - inputs = ["output_left", "output_right", "label"], - ) diff --git a/paddle/trainer/tests/sample_trainer_config_opt_a.conf b/paddle/trainer/tests/sample_trainer_config_opt_a.conf deleted file mode 100644 index b1744db8d604c88ec47e7104f79b38bb9d0e4442..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_trainer_config_opt_a.conf +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -################################### Data Configuration ################################### -TrainData(ProtoData(files = "trainer/tests/mnist.list")) -################################### Algorithm Configuration ################################### -settings(batch_size = 1000, - learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) -################################### Network Configuration ################################### -data = data_layer(name ="input", size=784) - -fc1 = fc_layer(input=data, size=800, - bias_attr=True, - act=SigmoidActivation()) - -fc2 = fc_layer(input=fc1, size=800, - bias_attr=True, - act=SigmoidActivation()) - -output = fc_layer(input=[fc1, fc2], size=10, - bias_attr=True, - act=SoftmaxActivation()) - -lbl = data_layer(name ="label", size=1) - -cost = classification_cost(input=output, label=lbl) -outputs(cost) diff --git a/paddle/trainer/tests/sample_trainer_config_opt_b.conf b/paddle/trainer/tests/sample_trainer_config_opt_b.conf deleted file mode 100644 index b1744db8d604c88ec47e7104f79b38bb9d0e4442..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_trainer_config_opt_b.conf +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -################################### Data Configuration ################################### -TrainData(ProtoData(files = "trainer/tests/mnist.list")) -################################### Algorithm Configuration ################################### -settings(batch_size = 1000, - learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) -################################### Network Configuration ################################### -data = data_layer(name ="input", size=784) - -fc1 = fc_layer(input=data, size=800, - bias_attr=True, - act=SigmoidActivation()) - -fc2 = fc_layer(input=fc1, size=800, - bias_attr=True, - act=SigmoidActivation()) - -output = fc_layer(input=[fc1, fc2], size=10, - bias_attr=True, - act=SoftmaxActivation()) - -lbl = data_layer(name ="label", size=1) - -cost = classification_cost(input=output, label=lbl) -outputs(cost) diff --git a/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf b/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf deleted file mode 100644 index d19222360c2f424ddb306b155dfef07921098a6b..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf +++ /dev/null @@ -1,154 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. - -# Note: when making change to this file, please make sure -# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest -# for comparing these two nets can pass (test_CompareTwoNets) - -default_initial_std(0.1) -default_device(0) - -word_dim = 1451594 -l1 = 0 -l2 = 0 - -model_type("nn") - -sparse_update = get_config_arg("sparse_update", bool, False) - -TrainData(ProtoData( - type = "proto_sequence", - files = ('trainer/tests/train.list'), - )) - -Settings( - algorithm='sgd', - batch_size=100, - learning_rate=0.0001, - learning_rate_decay_a=4e-08, - learning_rate_decay_b=0.0, - learning_rate_schedule='poly', -) - - -wordvec_dim = 128 -layer2_dim = 96 -layer3_dim = 96 -hidden_dim = 128 - -slot_names = ["qb", "qw", "tb", "tw"] - -def ltr_network(network_name, - word_dim=word_dim, - wordvec_dim=wordvec_dim, - layer2_dim=layer2_dim, - layer3_dim=layer3_dim, - hidden_dim=hidden_dim, - slot_names=slot_names, - l1=l1, - l2=l2): - - slotnum = len(slot_names) - for i in xrange(slotnum): - Inputs(slot_names[i] + network_name) - for i in xrange(slotnum): - Layer( - name = slot_names[i] + network_name, - type = "data", - size = word_dim, - device = -1, - ) - Layer( - name = slot_names[i] + "_embedding_" + network_name, - type = "mixed", - size = wordvec_dim, - bias = False, - device = -1, - inputs = TableProjection(slot_names[i] + network_name, - parameter_name = "embedding.w0", - decay_rate_l1=l1, - sparse_remote_update = True, - sparse_update = sparse_update, - ), - ) - Layer( - name = slot_names[i] + "_rnn1_" + network_name, - type = "recurrent", - active_type = "tanh", - bias = Bias(initial_std = 0, - parameter_name = "rnn1.bias"), - inputs = Input(slot_names[i] + "_embedding_" + network_name, - parameter_name = "rnn1.w0") - ) - Layer( - name = slot_names[i] + "_rnnlast_" + network_name, - type = "seqlastins", - inputs = [ - slot_names[i] + "_rnn1_" + network_name, - ], - ) - - Layer( - name = "layer2_" + network_name, - type = "fc", - active_type = "tanh", - size = layer2_dim, - bias = Bias(parameter_name = "layer2.bias"), - inputs = [Input(slot_name + "_rnnlast_" + network_name, - parameter_name = "_layer2_" + slot_name + ".w", - decay_rate = l2, - initial_smart = True) for slot_name in slot_names] - ) - Layer( - name = "layer3_" + network_name, - type = "fc", - active_type = "tanh", - size = layer3_dim, - bias = Bias(parameter_name = "layer3.bias"), - inputs = [ - Input("layer2_" + network_name, - parameter_name = "_layer3.w", - decay_rate = l2, - initial_smart = True), - ] - ) - Layer( - name = "output_" + network_name, - type = "fc", - size = 1, - bias = False, - inputs = [ - Input("layer3_" + network_name, - parameter_name = "_layerO.w"), - ], - ) - - -ltr_network("left") -ltr_network("right") -Inputs("label") -Layer( - name = "label", - type = "data", - size = 1, - ) -Outputs("cost", "qb_rnnlast_left") -Layer( - name = "cost", - type = "rank-cost", - inputs = ["output_left", "output_right", "label"], - ) diff --git a/paddle/trainer/tests/sample_trainer_config_rnn.conf b/paddle/trainer/tests/sample_trainer_config_rnn.conf deleted file mode 100644 index b720d4d5a6ca59e207832a8c5410c2cb6074c439..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_trainer_config_rnn.conf +++ /dev/null @@ -1,180 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. - -# Note: when making change to this file, please make sure -# sample_trainer_config_qb_rnn.conf is changed accordingly so that the uniitest -# for comparing these two nets can pass (test_CompareTwoNets) - -default_initial_std(0.1) -default_device(0) - -word_dim = 1451594 -l1 = 0 -l2 = 0 - -model_type("recurrent_nn") - -sparse_update = get_config_arg("sparse_update", bool, False) - -TrainData(ProtoData( - type = "proto_sequence", - files = ('trainer/tests/train.list'), - )) - -Settings( - algorithm='sgd', - batch_size=100, - learning_rate=0.0001, - learning_rate_decay_a=4e-08, - learning_rate_decay_b=0.0, - learning_rate_schedule='poly', -) - - -wordvec_dim = 128 -layer2_dim = 96 -layer3_dim = 96 -hidden_dim = 128 - -slot_names = ["qb", "qw", "tb", "tw"] - -def SimpleRecurrentLayer(name, - size, - active_type, - bias, - input_layer_name, - parameter_name, - seq_reversed = False): - RecurrentLayerGroupBegin(name + "_layer_group", - in_links=[input_layer_name], - out_links=[name], - seq_reversed=seq_reversed) - memory_name = Memory(name=name, size=size) - Layer( - name = name, - type = "mixed", - size = size, - active_type = active_type, - bias = bias, - inputs = [IdentityProjection(input_layer_name), - FullMatrixProjection(memory_name, - parameter_name = parameter_name, - ), - ] - ) - RecurrentLayerGroupEnd(name + "_layer_group") - - -def ltr_network(network_name, - word_dim=word_dim, - wordvec_dim=wordvec_dim, - layer2_dim=layer2_dim, - layer3_dim=layer3_dim, - hidden_dim=hidden_dim, - slot_names=slot_names, - l1=l1, - l2=l2): - - slotnum = len(slot_names) - for i in xrange(slotnum): - Inputs(slot_names[i] + network_name) - for i in xrange(slotnum): - Layer( - name = slot_names[i] + network_name, - type = "data", - size = word_dim, - device = -1, - ) - Layer( - name = slot_names[i] + "_embedding_" + network_name, - type = "mixed", - size = wordvec_dim, - bias = False, - device = -1, - inputs = TableProjection(slot_names[i] + network_name, - parameter_name = "embedding.w0", - decay_rate_l1=l1, - sparse_remote_update = True, - sparse_update = sparse_update, - ), - ) - SimpleRecurrentLayer( - name = slot_names[i] + "_rnn1_" + network_name, - size = hidden_dim, - active_type = "tanh", - bias = Bias(initial_std = 0, - parameter_name = "rnn1.bias"), - input_layer_name = slot_names[i] + "_embedding_" + network_name, - parameter_name = "rnn1.w0", - ) - Layer( - name = slot_names[i] + "_rnnlast_" + network_name, - type = "seqlastins", - inputs = [ - slot_names[i] + "_rnn1_" + network_name, - ], - ) - Layer( - name = "layer2_" + network_name, - type = "fc", - active_type = "tanh", - size = layer2_dim, - bias = Bias(parameter_name = "layer2.bias"), - inputs = [Input(slot_name + "_rnnlast_" + network_name, - parameter_name = "_layer2_" + slot_name + ".w", - decay_rate = l2, - initial_smart = True) for slot_name in slot_names] - ) - Layer( - name = "layer3_" + network_name, - type = "fc", - active_type = "tanh", - size = layer3_dim, - bias = Bias(parameter_name = "layer3.bias"), - inputs = [ - Input("layer2_" + network_name, - parameter_name = "_layer3.w", - decay_rate = l2, - initial_smart = True), - ] - ) - Layer( - name = "output_" + network_name, - type = "fc", - size = 1, - bias = False, - inputs = [ - Input("layer3_" + network_name, - parameter_name = "_layerO.w"), - ], - ) - - -ltr_network("left") -ltr_network("right") -Inputs("label") -Layer( - name = "label", - type = "data", - size = 1, - ) -Outputs("cost", "qb_rnnlast_left") -Layer( - name = "cost", - type = "rank-cost", - inputs = ["output_left", "output_right", "label"], - ) diff --git a/paddle/trainer/tests/testPyDataWrapper.py b/paddle/trainer/tests/testPyDataWrapper.py index 2c29a274339747b78fbd6c27ae4070f0abbd4028..a76eeeacb91cdba305d2f71c6292f79e4b98dd73 100644 --- a/paddle/trainer/tests/testPyDataWrapper.py +++ b/paddle/trainer/tests/testPyDataWrapper.py @@ -20,28 +20,6 @@ import random import json import string - -@provider(slots=[ - SparseNonValueSlot(10), DenseSlot(2), SparseValueSlot(10), StringSlot(1), - IndexSlot(3) -]) -def processNonSequenceData(obj, filename): - with open(filename, "rb") as f: - for line in f: - slots_str = line.split(';') - index = int(slots_str[0]) - non_values = map(int, slots_str[1].split()[1:]) - dense = map(float, slots_str[2].split()[1:]) - strs = slots_str[4].strip().split(' ', 1)[1] - - def __values_mapper__(s): - s = s.split(":") - return int(s[0]), float(s[1]) - - values = map(__values_mapper__, slots_str[3].split()[1:]) - yield [non_values, dense, values, strs, index] - - SPARSE_ID_LIMIT = 1000 SPARSE_ID_COUNT = 100 SEQUENCE_LIMIT = 50 @@ -146,8 +124,6 @@ def processSubSeqAndGenerateData(obj, name): if __name__ == "__main__": - pvd = processNonSequenceData("test.txt") - print pvd.getNextBatch(100) pvd = processSeqAndGenerateData("_") print pvd.getNextBatch(100) pvd = processSubSeqAndGenerateData("_") diff --git a/paddle/trainer/tests/test_CompareTwoOpts.cpp b/paddle/trainer/tests/test_CompareTwoOpts.cpp deleted file mode 100644 index 383505f8131264844069d6f0fa13f4e0ac1f97af..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/test_CompareTwoOpts.cpp +++ /dev/null @@ -1,184 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include - -#include "paddle/trainer/Trainer.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_int32(gpu_id); - -DECLARE_bool(local); -DECLARE_bool(use_gpu); - -DECLARE_string(config); -DECLARE_string(nics); - -DEFINE_string(config_file_a, "", "config of one network to compare"); -DEFINE_string(config_file_b, "", "config of another network to compare"); -DEFINE_bool(need_high_accuracy, - true, - "whether need to run in double accuracy (recommended)"); -DEFINE_double( - max_diff_ratio, - 0.0f, - "max diff ratio allowed for outputs and parameters (value/gradient)"); - -struct ComData { - vector outArgs; - vector parameters; -}; - -void calcGradient(ComData& data, const string configFile) { - FLAGS_config = configFile; - - FLAGS_local = true; - FLAGS_use_gpu = false; - - FLAGS_nics = ""; - - *ThreadLocalRand::getSeed() = 0; - srand(0); - - Trainer trainer; - trainer.init(TrainerConfigHelper::createFromFlagConfig(), false); - - data.parameters = trainer.getGradientMachine()->getParameters(); - trainer.getDataProvider()->setSkipShuffle(); - trainer.train(); -} - -void checkBuffer(real* A, - const char* desA, - real* B, - const char* desB, - size_t len, - size_t width = 1) { - int nNum = 0; - for (size_t i = 0; i < len; ++i) { - real diff = fabs(A[i] - B[i]); - if (diff > 0.0f && - diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_max_diff_ratio) { - nNum++; - LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i] - << " " << desB << " : " << B[i]; - } - } - EXPECT_EQ(0, nNum); - LOG(INFO) << "\n\n"; -} - -void compareGradient(ComData& comDataA, ComData& comDataB) { - vector outArgsA = comDataA.outArgs; - vector outArgsB = comDataB.outArgs; - - for (size_t i = 0; i < outArgsA.size(); ++i) { - CpuMatrix matA(outArgsA[i].value->getHeight(), - outArgsA[i].value->getWidth()); - CpuMatrix matB(outArgsB[i].value->getHeight(), - outArgsB[i].value->getWidth()); - - matA.copyFrom(*outArgsA[i].value); - matB.copyFrom(*outArgsB[i].value); - - LOG(INFO) << "\n--------------------------------" - << " Check Network Output_" << i << ":" - << " -------------------------------------\n"; - checkBuffer(matA.getData(), - "network A output", - matB.getData(), - "network B output", - matA.getElementCnt(), - matA.getWidth()); - } - - vector& parametersA = comDataA.parameters; - vector& parametersB = comDataB.parameters; - - LOG(INFO) << "\n\n--------------------------------" - << " Check Gradient Machine Parameters:" - << " -------------------------------------\n"; - for (size_t i = 0; i < parametersA.size(); ++i) { - ParameterPtr parameterA, parameterB; - parameterA = parametersA[i]; - parameterB = parametersB[i]; - - CpuVector paraA(parameterA->getSize()); - CpuVector paraB(parameterB->getSize()); - paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE)); - paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE)); - - LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() - << " ; size : " << paraA.getSize() << " ------------"; - checkBuffer(paraA.getData(), - "Network A", - paraB.getData(), - "Network B", - paraA.getSize()); - - CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT)); - CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT)); - - LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName() - << " ; size : " << gradA.getSize() << " -----------"; - checkBuffer(gradA.getData(), - "Network A", - gradB.getData(), - "Network B", - gradA.getSize()); - } -} - -TEST(Trainer, create) { - ComData dataA; - calcGradient(dataA, FLAGS_config_file_a); - LOG(INFO) << "\n\ntraining of Network A is finished\n\n"; - - ComData dataB; - calcGradient(dataB, FLAGS_config_file_b); - LOG(INFO) << "\n\ntraining of the Network B is finished\n\n"; - - compareGradient(dataA, dataB); -} - -int main(int argc, char** argv) { - paddle::initMain(argc, argv); - testing::InitGoogleTest(&argc, argv); - initPython(argc, argv); - -#ifndef PADDLE_TYPE_DOUBLE - if (FLAGS_need_high_accuracy) { - LOG(INFO) << "skip test due to it's need high accuracy"; - return 0; - } - if (FLAGS_max_diff_ratio == 0.0f) { - FLAGS_max_diff_ratio = 2e-4; - LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio - << " in low accuracy mode"; - } -#else - if (FLAGS_max_diff_ratio == 0.0f) { - FLAGS_max_diff_ratio = 2e-7; - LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio - << " in high accuracy mode"; - } -#endif - int ret = RUN_ALL_TESTS(); - return ret; -} diff --git a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp index 66ec65e340a435a7260028611828fb28845e0728..92dc8aa9ec5ce281d1950d84260c1b9555e686a7 100644 --- a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp +++ b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp @@ -25,45 +25,9 @@ limitations under the License. */ #include #include "picojson.h" -void checkEqual(const paddle::Argument& expect, const paddle::Argument& actual); void checkValue(std::vector& arguments, picojson::array& arr); const std::string kDir = "./trainer/tests/pydata_provider_wrapper_dir/"; -TEST(PyDataProviderWrapper, NoSequenceData) { - paddle::DataConfig conf; - conf.set_type("py"); - conf.set_load_data_module(std::string("testPyDataWrapper")); - conf.set_load_data_object(std::string("processNonSequenceData")); - conf.set_async_load_data(false); - conf.clear_files(); - conf.set_files(kDir + "test_pydata_provider_wrapper.list"); - paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false)); - provider->setSkipShuffle(); - provider->reset(); - paddle::DataBatch batchFromPy; - provider->getNextBatch(100, &batchFromPy); - - paddle::DataConfig conf2; - conf2.set_type("proto"); - conf2.set_async_load_data(false); - conf2.clear_files(); - conf2.set_files(kDir + "test_pydata_provider_wrapper.protolist"); - - provider.reset(paddle::DataProvider::create(conf2, false)); - provider->setSkipShuffle(); - provider->reset(); - paddle::DataBatch batchFromProto; - provider->getNextBatch(100, &batchFromProto); - - std::vector& pyArguments = batchFromPy.getStreams(); - std::vector& protoArguments = batchFromProto.getStreams(); - EXPECT_EQ(pyArguments.size(), protoArguments.size()); - - for (size_t i = 0; i < pyArguments.size(); ++i) { - checkEqual(protoArguments[i], pyArguments[i]); - } -} - TEST(PyDataProviderWrapper, SequenceData) { paddle::DataConfig conf; conf.set_type("py"); @@ -148,66 +112,6 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -void checkEqual(const paddle::Argument& expect, - const paddle::Argument& actual) { - if (expect.value) { - EXPECT_TRUE(actual.value != nullptr); - paddle::Matrix* e = expect.value.get(); - paddle::Matrix* a = actual.value.get(); - EXPECT_EQ(e->getWidth(), a->getWidth()); - EXPECT_EQ(e->getHeight(), a->getHeight()); - if (dynamic_cast(e)) { - paddle::CpuSparseMatrix* se = dynamic_cast(e); - paddle::CpuSparseMatrix* sa = dynamic_cast(a); - EXPECT_EQ(se->getFormat(), sa->getFormat()); - EXPECT_EQ(se->getElementCnt(), sa->getElementCnt()); - size_t rowSize = se->getFormat() == paddle::SPARSE_CSC - ? se->getElementCnt() - : se->getHeight() + 1; - size_t colSize = se->getFormat() == paddle::SPARSE_CSC - ? se->getWidth() + 1 - : se->getElementCnt(); - for (size_t i = 0; i < rowSize; ++i) { - EXPECT_EQ(se->getRows()[i], sa->getRows()[i]); - } - for (size_t i = 0; i < colSize; ++i) { - EXPECT_EQ(se->getCols()[i], sa->getCols()[i]); - } - if (se->getValueType() == paddle::FLOAT_VALUE) { - EXPECT_EQ(paddle::FLOAT_VALUE, sa->getValueType()); - for (size_t i = 0; i < se->getElementCnt(); ++i) { - EXPECT_EQ(se->getValue()[i], sa->getValue()[i]); - } - } - } else if (dynamic_cast(e)) { - EXPECT_EQ(e->getElementCnt(), a->getElementCnt()); - for (size_t i = 0; i < e->getElementCnt(); ++i) { - EXPECT_EQ(e->getData()[i], a->getData()[i]); - } - } - } - - if (expect.ids) { - EXPECT_TRUE(actual.ids != nullptr); - paddle::VectorT* e = expect.ids.get(); - paddle::VectorT* a = actual.ids.get(); - EXPECT_EQ(e->getSize(), a->getSize()); - for (size_t i = 0; i < e->getSize(); ++i) { - EXPECT_EQ(e->getData()[i], a->getData()[i]); - } - } - - if (expect.strs) { - EXPECT_TRUE(actual.strs != nullptr); - std::vector* e = expect.strs.get(); - std::vector* a = actual.strs.get(); - EXPECT_EQ(e->size(), a->size()); - for (size_t i = 0; i < e->size(); ++i) { - EXPECT_EQ((*e)[i], (*a)[i]); - } - } -} - void checkValue(std::vector& arguments, picojson::array& arr) { // CHECK SLOT 0, Sparse Value. diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 42aac59d221d95c8eb9275d8e6baf460e1f4e3ed..0b523ac7e0bf5231398778ea69270c883ac112d2 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1826,7 +1826,7 @@ class FCLayer(LayerBase): self.layer_type = 'mkldnn_fc' config_assert( len(inputs) == 1, - "MkldnnFCLayer support one and only one input!") + "MKLDNNFCLayer support one and only one input!") super(FCLayer, self).__init__( name, self.layer_type, size, inputs=inputs, **xargs) for input_index in xrange(len(self.inputs)): @@ -1837,7 +1837,7 @@ class FCLayer(LayerBase): sparse = format == "csr" or format == "csc" if use_mkldnn: config_assert(not sparse, - "MkldnnFCLayer do not support sparse format yet") + "MKLDNNFCLayer do not support sparse format yet") if use_mkldnn_wgt: dims = [self.config.size, input_layer.size] if sparse: @@ -1853,7 +1853,7 @@ class FCLayer(LayerBase): @config_layer('mkldnn_fc') -class MkldnnFcLayer(FCLayer): +class MKLDNNFcLayer(FCLayer): layer_type = 'mkldnn_fc' @@ -3209,6 +3209,18 @@ class SubNestedSequenceLayer(LayerBase): self.set_layer_size(size) +@config_layer('dot_prod') +class DotProdLayer(LayerBase): + def __init__(self, name, inputs, device=None): + super(DotProdLayer, self).__init__( + name, 'dot_prod', 0, inputs, device=device) + config_assert(len(inputs) == 2, 'DotProdLayer must have 2 inputs.') + config_assert( + self.get_input_layer(0).size == self.get_input_layer(1).size, + "Two inputs should have the same size.") + self.set_layer_size(1) + + @config_layer('out_prod') class OuterProdLayer(LayerBase): def __init__(self, name, inputs, device=None): @@ -3335,7 +3347,9 @@ class CosSimLayer(LayerBase): def __init__(self, name, inputs, cos_scale=1, device=None): super(CosSimLayer, self).__init__( name, 'cos', 1, inputs=inputs, device=device) - config_assert(len(self.inputs) == 2, 'CosSimLayer must have 2 inputs') + config_assert( + len(self.inputs) == 2, + 'The CosSimLayer expects two and only two inputs.') config_assert( self.get_input_layer(0).size == self.get_input_layer(1).size, 'The two inputs of CosSimLayer must have the same dimensionality.') @@ -3349,10 +3363,10 @@ class CosSimVecMatLayer(LayerBase): name, 'cos_vm', size, inputs=inputs, device=device) self.config.cos_scale = cos_scale config_assert( - len(self.inputs) == 2, 'CosSimVecMatLayer must have 2 inputs') + len(self.inputs) == 2, 'The CosSimVecMatLayer must have 2 inputs.') config_assert( size * self.get_input_layer(0).size == self.get_input_layer(1).size, - 'Wrong input size for CosSimVecMatLayer') + 'Wrong input size for CosSimVecMatLayer.') @config_layer('l2_distance') @@ -3520,11 +3534,17 @@ def ExpressionLayer(name, inputs, **xargs): @config_layer('concat') class ConcatenateLayer(LayerBase): + layer_type = 'concat' + def __init__(self, name, inputs, bias=False, **xargs): config_assert(inputs, 'inputs cannot be empty') config_assert(not bias, 'ConcatenateLayer cannot support bias.') + use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0))) + if self.layer_type == "mkldnn_concat": + config_assert(use_mkldnn, "mkldnn_concat only support MKLDNN") + self.layer_type = 'mkldnn_concat' if use_mkldnn else 'concat' super(ConcatenateLayer, self).__init__( - name, 'concat', 0, inputs=inputs, **xargs) + name, self.layer_type, 0, inputs=inputs, **xargs) size = 0 for input_index in xrange(len(self.inputs)): assert self.get_input_layer(0).height == self.get_input_layer( @@ -3544,6 +3564,11 @@ class ConcatenateLayer(LayerBase): self.set_layer_size(size) +@config_layer('mkldnn_concat') +class MKLDNNConcatLayer(ConcatenateLayer): + layer_type = 'mkldnn_concat' + + # like concat layer, but each input layer was processed by a Projection. @config_layer('concat2') class ConcatenateLayer2(LayerBase): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index e8f4f0035d8322b1fc2477bfd1a86806ecd6ac7b..5b39a65d8cbec7046dc8af1be668bed6c9aea5b5 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -116,6 +116,7 @@ __all__ = [ 'huber_classification_cost', 'block_expand_layer', 'maxout_layer', + 'dot_prod_layer', 'out_prod_layer', 'printer_layer', 'print_layer', @@ -199,6 +200,7 @@ class LayerType(object): SCALING_LAYER = 'scaling' TRANS_LAYER = 'trans' ROTATE_LAYER = 'rotate' + DOT_PROD_LAYER = 'dot_prod' OUT_PROD_LAYER = 'out_prod' FEATURE_MAP_EXPAND_LAYER = 'featmap_expand' @@ -3034,8 +3036,10 @@ def img_cmrnorm_layer(input, layer_attr=None): """ Response normalization across feature maps. - The details please refer to - `Alex's paper `_. + + Reference: + ImageNet Classification with Deep Convolutional Neural Networks + http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf The example usage is: @@ -3044,7 +3048,7 @@ def img_cmrnorm_layer(input, norm = img_cmrnorm_layer(input=net, size=5) :param name: The name of this layer. It is optional. - :type name: None | basestring + :type name: basestring :param input: The input of this layer. :type input: LayerOutput :param size: Normalize in number of :math:`size` feature maps. @@ -3053,9 +3057,11 @@ def img_cmrnorm_layer(input, :type scale: float :param power: The hyper-parameter. :type power: float - :param num_channels: input layer's filers number or channels. If - num_channels is None, it will be set automatically. - :param layer_attr: Extra Layer Attribute. + :param num_channels: The number of input channels. If the parameter is not set or + set to None, its actual value will be automatically set to + the channels number of the input. + :param layer_attr: The extra layer attributes. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. :rtype: LayerOutput @@ -3083,7 +3089,7 @@ def batch_norm_layer(input, use_global_stats=None, mean_var_names=None): """ - Batch Normalization Layer. The notation of this layer as follow. + Batch Normalization Layer. The notation of this layer is as follows. :math:`x` is the input features over a mini-batch. @@ -3097,8 +3103,10 @@ def batch_norm_layer(input, \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift - The details of batch normalization please refer to this - `paper `_. + Reference: + Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift + http://arxiv.org/abs/1502.03167 The example usage is: @@ -3108,48 +3116,47 @@ def batch_norm_layer(input, :param name: The name of this layer. It is optional. :type name: basestring - :param input: batch normalization input. Better be linear activation. - Because there is an activation inside batch_normalization. + :param input: This layer's input which is to be performed batch normalization on. :type input: LayerOutput :param batch_norm_type: We have batch_norm, mkldnn_batch_norm and cudnn_batch_norm. batch_norm supports CPU, MKLDNN and GPU. cudnn_batch_norm requires cuDNN version greater or equal to v4 (>=v4). But cudnn_batch_norm is faster and needs less memory than batch_norm. mkldnn_batch_norm requires - enable use_mkldnn. By default (None), we will - automaticly select cudnn_batch_norm for GPU, + use_mkldnn is enabled. By default (None), we will + automatically select cudnn_batch_norm for GPU, mkldnn_batch_norm for MKLDNN and batch_norm for CPU. - Otherwise, select batch norm type based on the - specified type. If you use cudnn_batch_norm, - we suggested you use latest version, such as v5.1. + Users can specify the batch norm type. If you use + cudnn_batch_norm, we suggested you use latest version, + such as v5.1. :type batch_norm_type: None | string, None or "batch_norm" or "cudnn_batch_norm" or "mkldnn_batch_norm" - :param act: Activation Type. Better be relu. Because batch - normalization will normalize input near zero. + :param act: Activation type. ReluActivation is the default activation. :type act: BaseActivation - :param num_channels: num of image channels or previous layer's number of - filters. None will automatically get from layer's - input. + :param num_channels: The number of input channels. If the parameter is not set or + set to None, its actual value will be automatically set to + the channels number of the input. :type num_channels: int - :param bias_attr: :math:`\\beta`, better be zero when initialize. So the - initial_std=0, initial_mean=1 is best practice. + :param bias_attr: :math:`\\beta`. The bias attribute. If the parameter is set to + False or an object whose type is not ParameterAttribute, no + bias is defined. If the parameter is set to True, the bias is + initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param param_attr: :math:`\\gamma`, better be one when initialize. So the - initial_std=0, initial_mean=1 is best practice. + :param param_attr: :math:`\\gamma`. The parameter attribute. See ParameterAttribute + for details. :type param_attr: ParameterAttribute - :param layer_attr: Extra Layer Attribute. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute - :param use_global_stats: whether use moving mean/variance statistics - during testing peroid. If None or True, - it will use moving mean/variance statistics during - testing. If False, it will use the mean - and variance of current batch of test data for - testing. + :param use_global_stats: Whether use moving mean/variance statistics during + testing peroid. If the parameter is set to None or + True, it will use moving mean/variance statistics + during testing. If the parameter is set to False, it + will use the mean and variance of the current batch + of test data. :type use_global_stats: bool | None. - :param moving_average_fraction: Factor used in the moving average - computation, referred to as facotr, - :math:`runningMean = newMean*(1-factor) - + runningMean*factor` + :param moving_average_fraction: Factor used in the moving average computation. + :math:`runningMean = newMean*(1-factor) + runningMean*factor` :type moving_average_fraction: float. :param mean_var_names: [mean name, variance name] :type mean_var_names: string list @@ -3211,8 +3218,9 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None): :type input: LayerOutput :param name: The name of this layer. It is optional. :type name: basestring - :param layer_attr: extra layer attributes. - :type layer_attr: ExtraLayerAttribute. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute + for details. + :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. :rtype: LayerOutput """ @@ -3247,7 +3255,8 @@ def row_l2_norm_layer(input, name=None, layer_attr=None): :type input: LayerOutput :param name: The name of this layer. It is optional. :type name: basestring - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute + for details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -3284,22 +3293,17 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): act=ReluActivation(), bias_attr=False) - This layer just simply add all input layers together, then activate the sum - inputs. Each input of this layer should be the same size, which is also the - output size of this layer. + This layer just simply adds all input layers together, then activates the + sum. All inputs should share the same dimension, which is also the dimension + of this layer's output. There is no weight matrix for each input, because it just a simple add operation. If you want a complicated operation before add, please use mixed_layer. - It is a very good way to set dropout outside the layers. Since not all - PaddlePaddle layer support dropout, you can add an add_to layer, set - dropout here. - Please refer to dropout_layer for details. - :param name: The name of this layer. It is optional. :type name: basestring - :param input: Input layers. It could be a LayerOutput or list/tuple of + :param input: The input layers. It could be a LayerOutput or list/tuple of LayerOutput. :type input: LayerOutput | list | tuple :param act: Activation Type. LinearActivation is the default activation. @@ -3308,7 +3312,8 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param layer_attr: Extra Layer attribute. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. :rtype: LayerOutput @@ -3347,8 +3352,8 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): @layer_support(DROPOUT, ERROR_CLIPPING) def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): """ - Concat all input vector into one huge vector. - Inputs can be list of LayerOutput or list of projection. + Concatenate all input vectors to one vector. + Inputs can be a list of LayerOutput or a list of projection. The example usage is: @@ -3358,11 +3363,12 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): :param name: The name of this layer. It is optional. :type name: basestring - :param input: input layers or projections + :param input: The input layers or projections :type input: list | tuple | collections.Sequence :param act: Activation type. IdentityActivation is the default activation. :type act: BaseActivation - :param layer_attr: Extra Layer Attribute. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. :rtype: LayerOutput @@ -3432,7 +3438,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): def seq_concat_layer(a, b, act=None, name=None, layer_attr=None, bias_attr=None): """ - Concat sequence a with sequence b. + Concatenate sequence a and sequence b. Inputs: - a = [a1, a2, ..., am] @@ -3451,13 +3457,14 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None, :param name: The name of this layer. It is optional. :type name: basestring - :param a: input sequence layer + :param a: The first input sequence layer :type a: LayerOutput - :param b: input sequence layer + :param b: The second input sequence layer :type b: LayerOutput :param act: Activation type. IdentityActivation is the default activation. :type act: BaseActivation - :param layer_attr: Extra Layer Attribute. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the @@ -3494,31 +3501,25 @@ def memory(name, boot_bias_active_type=None, boot_with_const_id=None): """ - The memory layers is a layer cross each time step. Reference this output - as previous time step layer :code:`name` 's output. - - The default memory is zero in first time step, previous time step's - output in the rest time steps. + The memory takes a layer's output at previous time step as its own output. - If boot_bias, the first time step value is this bias and - with activation. + If boot_bias, the activation of the bias is the initial value of the memory. - If boot_with_const_id, then the first time stop is a IndexSlot, the - Arguments.ids()[0] is this :code:`cost_id`. + If boot_with_const_id is set, then the memory's output at the first time step + is a IndexSlot, the Arguments.ids()[0] is this :code:`cost_id`. - If boot_layer is not null, the memory is just the boot_layer's output. - Set :code:`is_seq` is true boot layer is sequence. + If boot_layer is specified, the memory's output at the first time step will + be the boot_layer's output. - The same name layer in recurrent group will set memory on each time - step. + In other case, the default memory's output at the first time step is zero. .. code-block:: python mem = memory(size=256, name='state') state = fc_layer(input=mem, size=256, name='state') - If you do not want to specify the name, you can equivalently use set_input() - to specify the layer needs to be remembered as the following: + If you do not want to specify the name, you can also use set_input() + to specify the layer to be remembered as the following: .. code-block:: python @@ -3526,26 +3527,31 @@ def memory(name, state = fc_layer(input=mem, size=256) mem.set_input(mem) - :param name: the name of the layer which this memory remembers. + :param name: The name of the layer which this memory remembers. If name is None, user should call set_input() to specify the name of the layer which this memory remembers. :type name: basestring - :param size: size of memory. + :param size: The dimensionality of memory. :type size: int - :param memory_name: the name of the memory. - It is ignored when name is provided. + :param memory_name: The name of the memory. It is ignored when name is provided. :type memory_name: basestring :param is_seq: DEPRECATED. is sequence for boot_layer :type is_seq: bool - :param boot_layer: boot layer of memory. + :param boot_layer: This parameter specifies memory's output at the first time + step and the output is boot_layer's output. :type boot_layer: LayerOutput | None - :param boot_bias: boot layer's bias + :param boot_bias: The bias attribute of memory's output at the first time step. + If the parameter is set to False or an object whose type is not + ParameterAttribute, no bias is defined. If the parameter is set + to True, the bias is initialized to zero. :type boot_bias: ParameterAttribute | None - :param boot_bias_active_type: boot layer's active type. + :param boot_bias_active_type: Activation type for memory's bias at the first time + step. LinearActivation is the default activation. :type boot_bias_active_type: BaseActivation - :param boot_with_const_id: boot layer's id. + :param boot_with_const_id: This parameter specifies memory's output at the first + time step and the output is an index. :type boot_with_const_id: int - :return: LayerOutput object which is a memory. + :return: LayerOutput object. :rtype: LayerOutput """ if boot_bias_active_type is None: @@ -4183,6 +4189,45 @@ def maxid_layer(input, name=None, layer_attr=None): size=l.config.size) +@wrap_name_default() +def dot_prod_layer(input1, input2, name=None, layer_attr=None): + """ + A layer for computing the dot product of two vectors. + + The example usage is: + + .. code-block:: python + + dot_prod = dot_prod_layer(input1=vec1, input2=vec2) + + :param name: The name of this layer. It is optional. + :type name: basestring + :param input1: The first input layer. + :type input: LayerOutput + :param input2: The second input layer. + :type input2: LayerOutput + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. + :type layer_attr: ExtraLayerAttribute. + :return: LayerOutput object. + :rtype: LayerOutput + """ + assert isinstance(input1, LayerOutput) + assert isinstance(input2, LayerOutput) + assert input1.size == input2.size, ("Two inputs should have the same size.") + + l = Layer( + name=name, + type=LayerType.DOT_PROD_LAYER, + inputs=[input1.name, input2.name], + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name=name, + layer_type=LayerType.DOT_PROD_LAYER, + parents=[input1, input2], + size=l.config.size) + + @wrap_name_default() def out_prod_layer(input1, input2, name=None, layer_attr=None): """ diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index 5014c14b8f0ca6db51ab958e9779a20bb9035ef6..6c09ca3d346e36b6f38ac5c89914e95a7383f719 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -11,6 +11,7 @@ test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_l test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer -test_scale_sub_region_layer test_l2_distance_layer) +test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer +test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr new file mode 100644 index 0000000000000000000000000000000000000000..f1530c382c3d81a82592af2c43c06eb4278e2b4a --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr @@ -0,0 +1,38 @@ +type: "nn" +layers { + name: "vector1" + type: "data" + size: 10 + active_type: "" +} +layers { + name: "vector2" + type: "data" + size: 10 + active_type: "" +} +layers { + name: "__dot_prod_layer_0__" + type: "dot_prod" + size: 1 + active_type: "" + inputs { + input_layer_name: "vector1" + } + inputs { + input_layer_name: "vector2" + } +} +input_layer_names: "vector1" +input_layer_names: "vector2" +output_layer_names: "__dot_prod_layer_0__" +sub_models { + name: "root" + layer_names: "vector1" + layer_names: "vector2" + layer_names: "__dot_prod_layer_0__" + input_layer_names: "vector1" + input_layer_names: "vector2" + output_layer_names: "__dot_prod_layer_0__" + is_recurrent_layer_group: false +} diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..e52d48dde0084aacd3f7874cc384d59287a0c7d5 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py @@ -0,0 +1,7 @@ +from paddle.trainer_config_helpers import * + +vec1 = data_layer(name='vector1', size=10) +vec2 = data_layer(name='vector2', size=10) +dot_product = dot_prod_layer(input1=vec1, input2=vec2) + +outputs(dot_product) diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 43df089363782b89524138b15fb4d8ea37abbca9..7bbe3eaaa67a117bc53571e6571365c3a26814c1 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -76,6 +76,31 @@ def init(**kwargs): for key in args_dict.keys(): args.append('--%s=%s' % (key, str(args_dict[key]))) + # auto set cpu environment + def set_env(key, value): + '''If the key has not been set in the environment, set it with value.''' + assert isinstance(key, str) + assert isinstance(value, str) + envset = os.environ.get(key) + if envset is None: + os.environ[key] = value + + ht = os.popen("lscpu |grep \"per core\"|awk -F':' '{print $2}'|xargs") + ht = int(ht.read()) + if ht == 1: # ht is off + set_env("OMP_DYNAMIC", "false") + set_env("KMP_AFFINITY", "granularity=fine,compact,0,0") + else: + set_env("OMP_DYNAMIC", "true") + set_env("KMP_AFFINITY", "granularity=fine,compact,1,0") + processors = os.popen("grep \"processor\" /proc/cpuinfo|sort -u|wc -l") + processors = int(processors.read()) + trainers = kwargs.get('trainer_count', 1) + threads = processors / trainers + threads = '1' if threads < 1 else str(threads) + set_env("OMP_NUM_THREADS", threads) + set_env("MKL_NUM_THREADS", threads) + if 'use_gpu' in kwargs: cp.g_command_config_args['use_gpu'] = kwargs['use_gpu'] if 'use_mkldnn' in kwargs: diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index f20567243ae67baecbdbac13f879f4cf2f66d298..acca6ba35ced8674d4eec7dc57e41673c90cf8f8 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -4,7 +4,10 @@ import collections import numpy as np import copy -__all__ = ['Block', 'Variable', 'Program', 'Operator', 'default_startup_program', 'default_main_program'] +__all__ = [ + 'Block', 'Variable', 'Program', 'Operator', 'default_startup_program', + 'default_main_program' +] def unique_name(prefix): @@ -12,9 +15,9 @@ def unique_name(prefix): return "_".join([prefix, str(uid)]) -def _debug_string_(proto): +def _debug_string_(proto, throw_on_error=True): error_fields = list() - if not proto.IsInitialized(error_fields): + if not proto.IsInitialized(error_fields) and throw_on_error: raise ValueError("{0} are not initialized\nThe message is {1}".format( error_fields, proto)) return proto.__str__() @@ -101,9 +104,12 @@ class Variable(object): self.stop_gradient = stop_gradient def __str__(self): + return self.to_string(True) + + def to_string(self, throw_on_error): protostr = self.desc.serialize_to_string() proto = framework_pb2.VarDesc.FromString(str(protostr)) - return _debug_string_(proto) + return _debug_string_(proto, throw_on_error) __repr__ = __str__ @@ -229,17 +235,17 @@ class Operator(object): in_proto.name) if found: - in_argus = inputs[in_proto.name] - if not isinstance(in_argus, list): - in_argus = [in_argus] - if not in_proto.duplicable and len(in_argus) > 1: + in_args = inputs[in_proto.name] + if not isinstance(in_args, list): + in_args = [in_args] + if not in_proto.duplicable and len(in_args) > 1: raise ValueError( "Input %s expects only one input, but %d are given." - % (in_proto.name, len(in_argus))) - in_argu_names = [] - for argu in in_argus: - in_argu_names.append(argu.name) - self.desc.set_input(in_proto.name, in_argu_names) + % (in_proto.name, len(in_args))) + in_arg_names = [] + for arg in in_args: + in_arg_names.append(arg.name) + self.desc.set_input(in_proto.name, in_arg_names) else: self.desc.set_input(in_proto.name, []) @@ -257,18 +263,18 @@ class Operator(object): str(e) for e in given))) for out_proto in proto.outputs: - out_argus = outputs[out_proto.name] - if not isinstance(out_argus, list): - out_argus = [out_argus] - if not out_proto.duplicable and len(out_argus) > 1: + out_args = outputs[out_proto.name] + if not isinstance(out_args, list): + out_args = [out_args] + if not out_proto.duplicable and len(out_args) > 1: raise ValueError( "Output %s expects only one output, but %d are given." % - (out_proto.name, len(out_argus))) - out_argu_names = [] - for argu in out_argus: - out_argu_names.append(argu.name) - argu.op = self - self.desc.set_output(out_proto.name, out_argu_names) + (out_proto.name, len(out_args))) + out_arg_names = [] + for arg in out_args: + out_arg_names.append(arg.name) + arg.op = self + self.desc.set_output(out_proto.name, out_arg_names) if attrs is not None: if not isinstance(attrs, dict): @@ -291,10 +297,13 @@ class Operator(object): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) - def __str__(self): + def to_string(self, throw_on_error): protostr = self.desc.serialize_to_string() proto = framework_pb2.OpDesc.FromString(str(protostr)) - return _debug_string_(proto) + return _debug_string_(proto, throw_on_error) + + def __str__(self): + return self.to_string(True) __repr__ = __str__ @@ -349,9 +358,12 @@ class Block(object): self.program = program def __str__(self): + return self.to_string(True) + + def to_string(self, throw_on_error): protostr = self.desc.serialize_to_string() proto = framework_pb2.BlockDesc.FromString(str(protostr)) - return _debug_string_(proto) + return _debug_string_(proto, throw_on_error) __repr__ = __str__ @@ -454,9 +466,12 @@ class Program(object): self.current_block_idx = 0 def __str__(self): + return self.to_string(True) + + def to_string(self, throw_on_error): protostr = self.desc.serialize_to_string() proto = framework_pb2.ProgramDesc.FromString(str(protostr)) - return _debug_string_(proto) + return _debug_string_(proto, throw_on_error) def clone(self): p = Program() @@ -512,7 +527,14 @@ class Program(object): assert isinstance(target, Variable) if no_grad_set is None: no_grad_set = set() - param_to_grad_info = self.desc.append_backward(target.desc, no_grad_set) + try: + param_to_grad_info = self.desc.append_backward(target.desc, + no_grad_set) + except Exception as e: + raise core.EnforceNotMet( + str(e) + "\nCurrent protobuf is\n{0}".format( + self.to_string(False))) + self.sync_with_cpp() return param_to_grad_info @@ -563,8 +585,10 @@ class Parameter(Variable): g_main_program = Program() g_startup_program = Program() + def default_startup_program(): return g_startup_program + def default_main_program(): return g_main_program diff --git a/python/paddle/v2/fluid/net_drawer.py b/python/paddle/v2/fluid/net_drawer.py index 17ad547c2bb5b79ef8225dd1a8f1ef49a6572508..94fdd5e38970b309580de6fc934b158a3c46e464 100644 --- a/python/paddle/v2/fluid/net_drawer.py +++ b/python/paddle/v2/fluid/net_drawer.py @@ -66,10 +66,13 @@ def parse_graph(program, graph, var_dict, **kwargs): if not var_dict.has_key(var): var_dict[var] = "Feed" + temp_id = 0 proto = framework_pb2.ProgramDesc.FromString( program.desc.serialize_to_string()) for block in proto.blocks: for op in block.ops: + op.type = op.type + "_" + str(temp_id) + temp_id += 1 graph.node(**draw_node(op)) for o in op.outputs: for arg in o.arguments: @@ -78,6 +81,7 @@ def parse_graph(program, graph, var_dict, **kwargs): for arg in e.arguments: if var_dict.has_key(arg): graph.edge(**draw_edge(var_dict, op, e, arg)) + break # only plot the first block def draw_graph(startup_program, main_program, **kwargs): diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py index ee677a2c5670a092c509b9ce1c555223bf22957f..a7f3bfc0caf76302674a00c80c2bd9ebf834f872 100644 --- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py @@ -1,33 +1,22 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.framework as framework -from paddle.v2.fluid.io import save_persistables, load_persistables +import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor +from paddle.v2.fluid.io import save_persistables, load_persistables +from paddle.v2.fluid.optimizer import SGDOptimizer -import numpy as np - -x = layers.data( - name='x', - shape=[13], - data_type='float32') +x = layers.data(name='x', shape=[13], data_type='float32') -y_predict = layers.fc(input=x, - size=1, - act=None) +y_predict = layers.fc(input=x, size=1, act=None) -y = layers.data( - name='y', - shape=[1], - data_type='float32') +y = layers.data(name='y', shape=[1], data_type='float32') -cost = layers.square_error_cost( - input=y_predict, - label=y) +cost = layers.square_error_cost(input=y_predict, label=y) avg_cost = layers.mean(x=cost) -sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) +sgd_optimizer = SGDOptimizer(learning_rate=0.001) opts = sgd_optimizer.minimize(avg_cost) BATCH_SIZE = 20 diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py index f4be835b3ad57d5b0076e8a816c2c3def46e0663..b8506125501b6e533c4594b37943ec36ca8e7d30 100644 --- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py +++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py @@ -1,21 +1,16 @@ import numpy as np import paddle.v2 as paddle import paddle.v2.fluid.core as core +import paddle.v2.fluid.framework as framework import paddle.v2.fluid.layers as layers import paddle.v2.fluid.nets as nets -import paddle.v2.fluid.optimizer as optimizer from paddle.v2.fluid.executor import Executor -import paddle.v2.fluid.framework as framework from paddle.v2.fluid.initializer import XavierInitializer +from paddle.v2.fluid.optimizer import AdamOptimizer def resnet_cifar10(input, depth=32): - def conv_bn_layer(input, - ch_out, - filter_size, - stride, - padding, - act='relu'): + def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): tmp = layers.conv2d( input=input, filter_size=filter_size, @@ -24,9 +19,7 @@ def resnet_cifar10(input, depth=32): padding=padding, act=None, bias_attr=False) - return layers.batch_norm( - input=tmp, - act=act) + return layers.batch_norm(input=tmp, act=act) def shortcut(input, ch_in, ch_out, stride, program, init_program): if ch_in != ch_out: @@ -35,28 +28,11 @@ def resnet_cifar10(input, depth=32): else: return input - def basicblock(input, - ch_in, - ch_out, - stride): - tmp = conv_bn_layer( - input, - ch_out, - 3, - stride, - 1) - tmp = conv_bn_layer( - tmp, - ch_out, - 3, - 1, - 1, - act=None) + def basicblock(input, ch_in, ch_out, stride): + tmp = conv_bn_layer(input, ch_out, 3, stride, 1) + tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None) short = shortcut(input, ch_in, ch_out, stride) - return layers.elementwise_add( - x=tmp, - y=short, - act='relu') + return layers.elementwise_add(x=tmp, y=short, act='relu') def layer_warp(block_func, input, ch_in, ch_out, count, stride): tmp = block_func(input, ch_in, ch_out, stride) @@ -67,45 +43,17 @@ def resnet_cifar10(input, depth=32): assert (depth - 2) % 6 == 0 n = (depth - 2) / 6 conv1 = conv_bn_layer( - input=input, - ch_out=16, - filter_size=3, - stride=1, - padding=1) - res1 = layer_warp( - basicblock, - conv1, - 16, - 16, - n, - 1) - res2 = layer_warp( - basicblock, - res1, - 16, - 32, - n, - 2) - res3 = layer_warp( - basicblock, - res2, - 32, - 64, - n, - 2) + input=input, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) + res2 = layer_warp(basicblock, res1, 16, 32, n, 2) + res3 = layer_warp(basicblock, res2, 32, 64, n, 2) pool = layers.pool2d( - input=res3, - pool_size=8, - pool_type='avg', - pool_stride=1) + input=res3, pool_size=8, pool_type='avg', pool_stride=1) return pool def vgg16_bn_drop(input): - def conv_block(input, - num_filter, - groups, - dropouts): + def conv_block(input, num_filter, groups, dropouts): return nets.img_conv_group( input=input, pool_size=2, @@ -123,22 +71,14 @@ def vgg16_bn_drop(input): conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - drop = layers.dropout( - x=conv5, - dropout_prob=0.5) + drop = layers.dropout(x=conv5, dropout_prob=0.5) fc1 = layers.fc(input=drop, size=512, act=None, param_attr={"initializer": XavierInitializer()}) - reshape1 = layers.reshape( - x=fc1, - shape=list(fc1.shape + (1, 1))) - bn = layers.batch_norm( - input=reshape1, - act='relu') - drop2 = layers.dropout( - x=bn, - dropout_prob=0.5) + reshape1 = layers.reshape(x=fc1, shape=list(fc1.shape + (1, 1))) + bn = layers.batch_norm(input=reshape1, act='relu') + drop2 = layers.dropout(x=bn, dropout_prob=0.5) fc2 = layers.fc(input=drop2, size=512, act=None, @@ -165,8 +105,8 @@ cost = layers.cross_entropy(input=predict, label=label) avg_cost = layers.mean(x=cost) accuracy = layers.accuracy(input=predict, label=label) -# optimizer = optimizer.SGDOptimizer(learning_rate=0.001) -optimizer = optimizer.AdamOptimizer(learning_rate=0.001) +# optimizer = SGDOptimizer(learning_rate=0.001) +optimizer = AdamOptimizer(learning_rate=0.001) opts = optimizer.minimize(avg_cost) BATCH_SIZE = 128 diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py index f330ff58137068e429008bc7aa07bbc8d2e35ac4..75fbaf83e8f3e62eb0d0abef9cfa267b65e72973 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py @@ -1,22 +1,15 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.evaluator as evaluator import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.nets as nets from paddle.v2.fluid.executor import Executor +from paddle.v2.fluid.optimizer import AdamOptimizer -import numpy as np - -images = layers.data( - name='pixel', - shape=[1, 28, 28], - data_type='float32') -label = layers.data( - name='label', - shape=[1], - data_type='int64') +images = layers.data(name='pixel', shape=[1, 28, 28], data_type='float32') +label = layers.data(name='label', shape=[1], data_type='int64') conv_pool_1 = nets.simple_img_conv_pool( input=images, filter_size=5, @@ -32,17 +25,13 @@ conv_pool_2 = nets.simple_img_conv_pool( pool_stride=2, act="relu") -predict = layers.fc(input=conv_pool_2, - size=10, - act="softmax") +predict = layers.fc(input=conv_pool_2, size=10, act="softmax") cost = layers.cross_entropy(input=predict, label=label) avg_cost = layers.mean(x=cost) -optimizer = optimizer.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999) +optimizer = AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999) opts = optimizer.minimize(avg_cost) -accuracy, acc_out = evaluator.accuracy( - input=predict, - label=label) +accuracy, acc_out = evaluator.accuracy(input=predict, label=label) BATCH_SIZE = 50 PASS_NUM = 3 diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py index b0164e3e3659c19edf2af45e706fb48ac1fe2b1c..cf10b1942e6a8243b18b0ae4586fdd7ec1a665fb 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py @@ -1,19 +1,15 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.regularizer import L2DecayRegularizer from paddle.v2.fluid.initializer import UniformInitializer - -import numpy as np +from paddle.v2.fluid.optimizer import MomentumOptimizer +from paddle.v2.fluid.regularizer import L2DecayRegularizer BATCH_SIZE = 128 -image = layers.data( - name='x', - shape=[784], - data_type='float32') +image = layers.data(name='x', shape=[784], data_type='float32') param_attr = { 'name': None, @@ -22,32 +18,21 @@ param_attr = { 'regularization': L2DecayRegularizer(0.0005 * BATCH_SIZE) } -hidden1 = layers.fc(input=image, - size=128, - act='relu', - param_attr=param_attr) -hidden2 = layers.fc(input=hidden1, - size=64, - act='relu', - param_attr=param_attr) +hidden1 = layers.fc(input=image, size=128, act='relu', param_attr=param_attr) +hidden2 = layers.fc(input=hidden1, size=64, act='relu', param_attr=param_attr) predict = layers.fc(input=hidden2, size=10, act='softmax', param_attr=param_attr) -label = layers.data( - name='y', - shape=[1], - data_type='int64') +label = layers.data(name='y', shape=[1], data_type='int64') cost = layers.cross_entropy(input=predict, label=label) avg_cost = layers.mean(x=cost) -accuracy = layers.accuracy( - input=predict, - label=label) +accuracy = layers.accuracy(input=predict, label=label) -optimizer = optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9) +optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) opts = optimizer.minimize(avg_cost) train_reader = paddle.batch( diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py index eefcb55bebff41eb9c67d9f0c8e83a5f1d4599bd..55ded3aed3a23c8cd7795f915dc1cbd512c6d945 100644 --- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py @@ -1,12 +1,11 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.nets as nets from paddle.v2.fluid.executor import Executor - -import numpy as np +from paddle.v2.fluid.optimizer import SGDOptimizer IS_SPARSE = True USE_GPU = False @@ -19,10 +18,7 @@ def get_usr_combined_features(): USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1 - uid = layers.data( - name='user_id', - shape=[1], - data_type='int64') + uid = layers.data(name='user_id', shape=[1], data_type='int64') usr_emb = layers.embedding( input=uid, @@ -31,15 +27,11 @@ def get_usr_combined_features(): param_attr={'name': 'user_table'}, is_sparse=IS_SPARSE) - usr_fc = layers.fc(input=usr_emb, - size=32) + usr_fc = layers.fc(input=usr_emb, size=32) USR_GENDER_DICT_SIZE = 2 - usr_gender_id = layers.data( - name='gender_id', - shape=[1], - data_type='int64') + usr_gender_id = layers.data(name='gender_id', shape=[1], data_type='int64') usr_gender_emb = layers.embedding( input=usr_gender_id, @@ -47,14 +39,10 @@ def get_usr_combined_features(): param_attr={'name': 'gender_table'}, is_sparse=IS_SPARSE) - usr_gender_fc = layers.fc(input=usr_gender_emb, - size=16) + usr_gender_fc = layers.fc(input=usr_gender_emb, size=16) USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table) - usr_age_id = layers.data( - name='age_id', - shape=[1], - data_type="int64") + usr_age_id = layers.data(name='age_id', shape=[1], data_type="int64") usr_age_emb = layers.embedding( input=usr_age_id, @@ -62,14 +50,10 @@ def get_usr_combined_features(): is_sparse=IS_SPARSE, param_attr={'name': 'age_table'}) - usr_age_fc = layers.fc(input=usr_age_emb, - size=16) + usr_age_fc = layers.fc(input=usr_age_emb, size=16) USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1 - usr_job_id = layers.data( - name='job_id', - shape=[1], - data_type="int64") + usr_job_id = layers.data(name='job_id', shape=[1], data_type="int64") usr_job_emb = layers.embedding( input=usr_job_id, @@ -77,16 +61,12 @@ def get_usr_combined_features(): param_attr={'name': 'job_table'}, is_sparse=IS_SPARSE) - usr_job_fc = layers.fc(input=usr_job_emb, - size=16) + usr_job_fc = layers.fc(input=usr_job_emb, size=16) concat_embed = layers.concat( - input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], - axis=1) + input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1) - usr_combined_features = layers.fc(input=concat_embed, - size=200, - act="tanh") + usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") return usr_combined_features @@ -95,10 +75,7 @@ def get_mov_combined_features(): MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1 - mov_id = layers.data( - name='movie_id', - shape=[1], - data_type='int64') + mov_id = layers.data(name='movie_id', shape=[1], data_type='int64') mov_emb = layers.embedding( input=mov_id, @@ -107,36 +84,24 @@ def get_mov_combined_features(): param_attr={'name': 'movie_table'}, is_sparse=IS_SPARSE) - mov_fc = layers.fc(input=mov_emb, - size=32) + mov_fc = layers.fc(input=mov_emb, size=32) CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories()) - category_id = layers.data( - name='category_id', - shape=[1], - data_type='int64') + category_id = layers.data(name='category_id', shape=[1], data_type='int64') mov_categories_emb = layers.embedding( - input=category_id, - size=[CATEGORY_DICT_SIZE, 32], - is_sparse=IS_SPARSE) + input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE) mov_categories_hidden = layers.sequence_pool( - input=mov_categories_emb, - pool_type="sum") + input=mov_categories_emb, pool_type="sum") MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict()) - mov_title_id = layers.data( - name='movie_title', - shape=[1], - data_type='int64') + mov_title_id = layers.data(name='movie_title', shape=[1], data_type='int64') mov_title_emb = layers.embedding( - input=mov_title_id, - size=[MOV_TITLE_DICT_SIZE, 32], - is_sparse=IS_SPARSE) + input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE) mov_title_conv = nets.sequence_conv_pool( input=mov_title_emb, @@ -146,13 +111,10 @@ def get_mov_combined_features(): pool_type="sum") concat_embed = layers.concat( - input=[mov_fc, mov_categories_hidden, mov_title_conv], - axis=1) + input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1) # FIXME(dzh) : need tanh operator - mov_combined_features = layers.fc(input=concat_embed, - size=200, - act="tanh") + mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") return mov_combined_features @@ -162,18 +124,11 @@ def model(): mov_combined_features = get_mov_combined_features() # need cos sim - inference = layers.cos_sim( - X=usr_combined_features, - Y=mov_combined_features) + inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features) - label = layers.data( - name='score', - shape=[1], - data_type='float32') + label = layers.data(name='score', shape=[1], data_type='float32') - square_cost = layers.square_error_cost( - input=inference, - label=label) + square_cost = layers.square_error_cost(input=inference, label=label) avg_cost = layers.mean(x=square_cost) @@ -182,7 +137,7 @@ def model(): def main(): cost = model() - sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.2) + sgd_optimizer = SGDOptimizer(learning_rate=0.2) opts = sgd_optimizer.minimize(cost) if USE_GPU: diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py index 91fc79a9870a31205098d8a40de6c033d5bf60b9..e69b915a9cfaf9e06075991975563a1fc1196661 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py @@ -1,12 +1,11 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.nets as nets from paddle.v2.fluid.executor import Executor - -import numpy as np +from paddle.v2.fluid.optimizer import AdamOptimizer def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32): @@ -31,7 +30,7 @@ def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32): act="softmax") cost = layers.cross_entropy(input=prediction, label=label) avg_cost = layers.mean(x=cost) - adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002) + adam_optimizer = AdamOptimizer(learning_rate=0.002) opts = adam_optimizer.minimize(avg_cost) acc = layers.accuracy(input=prediction, label=label) return avg_cost, acc diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py index 8c3d4488354eb363cd1d378ebd4cb8069e7c1b1d..65d44542501e6531fc1912cbc726a1d903b9c031 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py @@ -1,12 +1,10 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor - -import numpy as np +from paddle.v2.fluid.optimizer import AdamOptimizer def stacked_lstm_net(input_dim, @@ -41,7 +39,7 @@ def stacked_lstm_net(input_dim, act='softmax') cost = layers.cross_entropy(input=prediction, label=label) avg_cost = layers.mean(x=cost) - adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002) + adam_optimizer = AdamOptimizer(learning_rate=0.002) opts = adam_optimizer.minimize(avg_cost) acc = layers.accuracy(input=prediction, label=label) return avg_cost, acc diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py index a7d791c1f38d4843f084127e879d613b21ae8daf..280f6e902c34512735a27586221c2be68963ef2b 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py @@ -1,11 +1,10 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor - -import numpy as np +from paddle.v2.fluid.optimizer import AdamOptimizer def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50): @@ -33,7 +32,7 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50): cost = layers.cross_entropy(input=prediction, label=label) avg_cost = layers.mean(x=cost) - adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002) + adam_optimizer = AdamOptimizer(learning_rate=0.002) opts = adam_optimizer.minimize(avg_cost) acc = layers.accuracy(input=prediction, label=label) diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py index 9dcb6f2fea06ea8cd061be4f148854408779f990..afa7b285198e0349317e123e4bd98e8336217afa 100644 --- a/python/paddle/v2/fluid/tests/book/test_word2vec.py +++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py @@ -1,11 +1,10 @@ +import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor - -import numpy as np +from paddle.v2.fluid.optimizer import SGDOptimizer PASS_NUM = 100 EMBED_SIZE = 32 @@ -17,26 +16,11 @@ IS_SPARSE = True word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) -first_word = layers.data( - name='firstw', - shape=[1], - data_type='int64') -second_word = layers.data( - name='secondw', - shape=[1], - data_type='int64') -third_word = layers.data( - name='thirdw', - shape=[1], - data_type='int64') -forth_word = layers.data( - name='forthw', - shape=[1], - data_type='int64') -next_word = layers.data( - name='nextw', - shape=[1], - data_type='int64') +first_word = layers.data(name='firstw', shape=[1], data_type='int64') +second_word = layers.data(name='secondw', shape=[1], data_type='int64') +third_word = layers.data(name='thirdw', shape=[1], data_type='int64') +forth_word = layers.data(name='forthw', shape=[1], data_type='int64') +next_word = layers.data(name='nextw', shape=[1], data_type='int64') embed_first = layers.embedding( input=first_word, @@ -64,19 +48,12 @@ embed_forth = layers.embedding( param_attr={'name': 'shared_w'}) concat_embed = layers.concat( - input=[embed_first, embed_second, embed_third, embed_forth], - axis=1) -hidden1 = layers.fc(input=concat_embed, - size=HIDDEN_SIZE, - act='sigmoid') -predict_word = layers.fc(input=hidden1, - size=dict_size, - act='softmax') -cost = layers.cross_entropy( - input=predict_word, - label=next_word) + input=[embed_first, embed_second, embed_third, embed_forth], axis=1) +hidden1 = layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid') +predict_word = layers.fc(input=hidden1, size=dict_size, act='softmax') +cost = layers.cross_entropy(input=predict_word, label=next_word) avg_cost = layers.mean(x=cost) -sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) +sgd_optimizer = SGDOptimizer(learning_rate=0.001) opts = sgd_optimizer.minimize(avg_cost) train_reader = paddle.batch( diff --git a/python/paddle/v2/fluid/tests/test_adagrad_op.py b/python/paddle/v2/fluid/tests/test_adagrad_op.py index 66bad349e59b608cb3cc965401c81ef4c716b318..903e84c32887100bbeef6ebf81f66f06f084fab5 100644 --- a/python/paddle/v2/fluid/tests/test_adagrad_op.py +++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py @@ -1,6 +1,9 @@ import unittest import numpy as np +import paddle.v2.fluid.core as core +from paddle.v2.fluid.op import Operator from op_test import OpTest +import math class TestAdagradOp1(OpTest): @@ -65,5 +68,110 @@ class TestAdagradOp2(OpTest): self.check_output() +class TestSparseAdagradOp(unittest.TestCase): + def check_with_place(self, place): + scope = core.Scope() + + # create and initialize Grad Variable + height = 10 + rows = [0, 4, 7, 4] + row_numel = 12 + + grad_selected_rows = scope.var('Grad').get_selected_rows() + grad_selected_rows.set_height(height) + grad_selected_rows.set_rows(rows) + np_array = np.ones((len(rows), row_numel)).astype("float32") + np_array[0, 0] = 2.0 + np_array[2, 8] = 4.0 + + grad_tensor = grad_selected_rows.get_tensor() + grad_tensor.set(np_array, place) + + # create and initialize Param Variable + param = scope.var('Param').get_tensor() + param_array = np.full((height, row_numel), 5.0).astype("float32") + param.set(param_array, place) + + # create and initialize LeraningRate Variable + lr = scope.var('LearningRate').get_tensor() + lr_array = np.full((1), 2.0).astype("float32") + lr.set(lr_array, place) + + # create and initialize moment Variable + moment = scope.var('Moment').get_tensor() + moment_np_array = np.full((height, row_numel), 2.0).astype("float32") + moment.set(moment_np_array, place) + + # create and run sgd operator + adagrad_op = Operator( + "adagrad", + Param='Param', + Grad='Grad', + ParamOut='Param', + Moment='Moment', + MomentOut='Moment', + LearningRate='LearningRate', + epsilon=2.0) + + ctx = core.DeviceContext.create(place) + adagrad_op.run(scope, ctx) + + # get and compare moment result + moment_result_array = np.array(moment) + + self.assertAlmostEqual(6.0, moment_result_array[rows[0], 0]) + self.assertAlmostEqual(3.0, moment_result_array[rows[0], 2]) + self.assertAlmostEqual(2.0, moment_result_array[1, 0]) + # 2.0 + (1.0 + 1.0)^2 + self.assertAlmostEqual(6.0, moment_result_array[rows[1], 10]) + self.assertAlmostEqual(6.0, moment_result_array[rows[3], 4]) + + self.assertAlmostEqual(2.0, moment_result_array[5, 8]) + self.assertAlmostEqual(3.0, moment_result_array[rows[2], 1]) + self.assertAlmostEqual(18.0, moment_result_array[rows[2], 8]) + + # get and compare param result + result_array = np.array(param) + + def get_out(param, lr, grad, m, epsilon): + return param - lr * grad / (math.sqrt(m) + epsilon) + + self.assertAlmostEqual( + get_out(5.0, 2.0, 2.0, 6.0, 2.0), + result_array[rows[0], 0], + places=5) + self.assertAlmostEqual( + get_out(5.0, 2.0, 1.0, 3.0, 2.0), + result_array[rows[0], 2], + places=5) + self.assertAlmostEqual( + get_out(5.0, 2.0, 0.0, 2.0, 2.0), result_array[1, 0], places=5) + + # grad_merge = 1.0 + 1.0 + # m = 6.0 + self.assertAlmostEqual( + get_out(5.0, 2.0, 2.0, 6.0, 2.0), + result_array[rows[1], 10], + places=5) + + self.assertAlmostEqual( + get_out(5.0, 2.0, 0.0, 2.0, 2.0), result_array[5, 8], places=5) + self.assertAlmostEqual( + get_out(5.0, 2.0, 1.0, 3.0, 2.0), + result_array[rows[2], 1], + places=5) + self.assertAlmostEqual( + get_out(5.0, 2.0, 4.0, 18.0, 2.0), + result_array[rows[2], 8], + places=5) + + def test_sparse_adagrad(self): + places = [core.CPUPlace()] + if core.is_compile_gpu(): + places.append(core.GPUPlace(0)) + for place in places: + self.check_with_place(place) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/v2/framework/tests/test_beam_search_op.py b/python/paddle/v2/fluid/tests/test_beam_search_op.py similarity index 94% rename from python/paddle/v2/framework/tests/test_beam_search_op.py rename to python/paddle/v2/fluid/tests/test_beam_search_op.py index a5a0cc0c96ab6c77abcc20c387eef7b3b1c8ac93..cc7c09bb59de3f83e47b4d95c1203f7f050c5132 100644 --- a/python/paddle/v2/framework/tests/test_beam_search_op.py +++ b/python/paddle/v2/fluid/tests/test_beam_search_op.py @@ -1,6 +1,6 @@ import logging -from paddle.v2.framework.op import Operator, DynamicRecurrentOp -import paddle.v2.framework.core as core +from paddle.v2.fluid.op import Operator, DynamicRecurrentOp +import paddle.v2.fluid.core as core import unittest import numpy as np diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py index 907b52c405d9e5c02c70f611e4c777ba21948c40..2240dc73cdd31f320fed174dd811e93c6640137f 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_op.py +++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py @@ -110,13 +110,30 @@ class TestConv2dOp(OpTest): self.op_type = "conv2d" +class TestWithPad(TestConv2dOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] / self.groups + self.filter_size = [6, f_c, 3, 3] + + +class TestWithStride(TestConv2dOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] / self.groups + self.filter_size = [6, f_c, 3, 3] + + class TestWithGroup(TestConv2dOp): def init_group(self): self.groups = 3 - def init_op_type(self): - self.op_type = "conv2d" - class TestWith1x1(TestConv2dOp): def init_test_case(self): @@ -127,15 +144,9 @@ class TestWith1x1(TestConv2dOp): f_c = self.input_size[1] / self.groups self.filter_size = [6, f_c, 1, 1] - def init_dilation(self): - self.dilations = [1, 1] - def init_group(self): self.groups = 3 - def init_op_type(self): - self.op_type = "conv2d" - class TestWithDilation(TestConv2dOp): def init_test_case(self): @@ -152,14 +163,19 @@ class TestWithDilation(TestConv2dOp): def init_group(self): self.groups = 3 + +#----------------Conv2dCudnn---------------- +class TestCudnn(TestConv2dOp): def init_op_type(self): - self.op_type = "conv2d" + self.op_type = "conv_cudnn" -#----------------Conv2dCudnn---------------- +class TestCudnnWithPad(TestWithPad): + def init_op_type(self): + self.op_type = "conv_cudnn" -class TestCudnn(TestConv2dOp): +class TestCudnnWithStride(TestWithStride): def init_op_type(self): self.op_type = "conv_cudnn" diff --git a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py index 54349c018c4a53b8767d6cd4f94d99c719dc0237..d7b1f2f2a3abf6335998742dbbef8e17794170fa 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py +++ b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py @@ -4,9 +4,7 @@ from op_test import OpTest def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): - # [2, 3, 5, 5] in_n, in_c, in_h, in_w = input_.shape - # [3, 6, 3, 3] f_c, out_c, f_h, f_w = filter_.shape assert in_c == f_c @@ -29,6 +27,7 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): j1, j2 = j * stride[0], j * stride[0] + f_w out[n, k, i1:i2, j1:j2] += tmp_out + out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]] return out @@ -36,8 +35,6 @@ class TestConv2dTransposeOp(OpTest): def setUp(self): # init as conv transpose self.init_op_type() - - # [2, 3, 5, 5] -> kernel [3, 6, 3, 3] -> output [2, 6, 7, 7] self.init_test_case() conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad} @@ -55,7 +52,6 @@ class TestConv2dTransposeOp(OpTest): self.outputs = {'Output': output} def test_check_output(self): - print 'check output here for', self.op_type self.check_output() def test_check_grad_no_input(self): @@ -88,6 +84,26 @@ class TestConv2dTransposeOp(OpTest): self.op_type = "conv2d_transpose" +class TestWithPad(TestConv2dTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + +class TestWithStride(TestConv2dTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.dilations = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + # ------------ test_cudnn ------------ class TestCudnn(TestConv2dTransposeOp): def init_op_type(self): diff --git a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py index 132fe7931438a30cf02e4ad2894c0838e48ffc9f..59a32c40821f2109306e898a6a798fea52b1e0ca 100644 --- a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py +++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py @@ -4,9 +4,7 @@ from op_test import OpTest def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): - # [2, 3, 5, 5, 5] in_n, in_c, in_d, in_h, in_w = input_.shape - # [3, 6, 3, 3, 3] f_c, out_c, f_d, f_h, f_w = filter_.shape assert in_c == f_c @@ -14,7 +12,6 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): out_d = (in_d - 1) * stride[0] + f_d out_h = (in_h - 1) * stride[1] + f_h out_w = (in_w - 1) * stride[2] + f_w - out = np.zeros((in_n, out_c, out_d, out_h, out_w)) for n in range(in_n): @@ -33,6 +30,8 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): j1, j2 = j * stride[2], j * stride[2] + f_w out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out + out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w - + pad[2]] return out @@ -40,8 +39,6 @@ class TestConv3dTransposeOp(OpTest): def setUp(self): # init as conv transpose self.init_op_type() - - # [2, 3, 5, 5, 5] -> kernel [3, 6, 3, 3, 3] -> output [2, 6, 7, 7, 7] self.init_test_case() conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad} @@ -49,7 +46,6 @@ class TestConv3dTransposeOp(OpTest): filter_ = np.random.random(self.filter_size).astype("float32") output = conv3dtranspose_forward_naive( input_, filter_, conv3dtranspose_param).astype("float32") - # print 'deconv output py', output, output.shape self.inputs = {'Input': input_, 'Filter': filter_} self.attrs = { @@ -60,7 +56,6 @@ class TestConv3dTransposeOp(OpTest): self.outputs = {'Output': output} def test_check_output(self): - print 'check output here' self.check_output() def test_check_grad(self): @@ -85,7 +80,7 @@ class TestConv3dTransposeOp(OpTest): self.pad = [0, 0, 0] self.stride = [1, 1, 1] self.dilations = [1, 1, 1] - self.input_size = [2, 3, 5, 5, 5] # NCHW + self.input_size = [2, 3, 5, 5, 5] # NCDHW f_c = self.input_size[1] self.filter_size = [f_c, 6, 3, 3, 3] @@ -93,5 +88,25 @@ class TestConv3dTransposeOp(OpTest): self.op_type = "conv3d_transpose" +class TestWithPad(TestConv3dTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + +class TestWithStride(TestConv3dTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_is_empty_op.py b/python/paddle/v2/fluid/tests/test_is_empty_op.py new file mode 100644 index 0000000000000000000000000000000000000000..ed6e3fe24f6333c9c90d760787eb13241a7e1868 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_is_empty_op.py @@ -0,0 +1,43 @@ +import unittest +import numpy as np +from paddle.v2.fluid.op import Operator +import paddle.v2.fluid.core as core + + +def create_tensor(scope, name, np_data): + tensor = scope.var(name).get_tensor() + tensor.set_dims(np_data.shape) + tensor.set(np_data, core.CPUPlace()) + return tensor + + +class TestIsEmptyOp(unittest.TestCase): + def setUp(self): + self.scope = core.Scope() + # create input variables + np_data0 = np.array([0, 1, 2]) + create_tensor(self.scope, "X0", np_data0) + + np_data1 = np.array([1]) + t = create_tensor(self.scope, "X1", np_data1) + t.set_dims([0]) + + # create output variables + self.scope.var("out") + + def test_no_empty(self): + self.one_case("X0", False) + + def test_empty(self): + self.one_case("X1", True) + + def one_case(self, input, target): + op = Operator(type="is_empty", X=input, Out="out") + ctx = core.DeviceContext.create(core.CPUPlace()) + op.run(self.scope, ctx) + out = self.scope.var("out").get_tensor() + self.assertEqual(np.array(out)[0], target) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_while_op.py b/python/paddle/v2/fluid/tests/test_while_op.py index 0f01acb3b94dc55a3536e751108e785ddc6e47bb..84b432333f950f754a97bc1a051b59c16fb22aed 100644 --- a/python/paddle/v2/fluid/tests/test_while_op.py +++ b/python/paddle/v2/fluid/tests/test_while_op.py @@ -2,6 +2,7 @@ import unittest import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor import paddle.v2.fluid.core as core +from paddle.v2.fluid.backward import append_backward_ops import numpy @@ -16,7 +17,7 @@ class TestWhileOp(unittest.TestCase): i = layers.zeros(shape=[1], dtype='int64') i.stop_gradient = True init = layers.zeros(shape=[10], dtype='float32') - mem_array = layers.array_write(init, i=i) + mem_array = layers.array_write(x=init, i=i) data_array = layers.array_write(x=d0, i=i) i = layers.increment(i) @@ -29,17 +30,23 @@ class TestWhileOp(unittest.TestCase): i.stop_gradient = True array_len = layers.fill_constant(shape=[1], dtype='int64', value=3) + array_len.stop_gradient = True cond = layers.less_than(x=i, y=array_len) while_op = layers.While(cond=cond) with while_op.block(): d = layers.array_read(array=data_array, i=i) prev = layers.array_read(array=mem_array, i=i) - i = layers.increment(x=i, in_place=True) result = layers.sums(input=[d, prev]) + + i = layers.increment(x=i, in_place=True) layers.array_write(result, i=i, array=mem_array) layers.less_than(x=i, y=array_len, cond=cond) - sum_result = layers.array_read(mem_array, i=array_len) + + sum_result = layers.array_read(array=mem_array, i=i) + loss = layers.mean(x=sum_result) + + append_backward_ops(loss) cpu = core.CPUPlace() exe = Executor(cpu)