diff --git a/CMakeLists.txt b/CMakeLists.txt
index fd3582a1bca199d62d19550ffdd1efe9db520fa7..9e30dff70fed51b604059610b22057349f22db58 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,8 +36,7 @@ include(simd)
################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
-option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND})
-option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND})
+option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
@@ -82,10 +81,8 @@ if(ANDROID OR IOS)
"Disable PYTHON when cross-compiling for Android and iOS" FORCE)
set(WITH_RDMA OFF CACHE STRING
"Disable RDMA when cross-compiling for Android and iOS" FORCE)
- set(WITH_MKLDNN OFF CACHE STRING
- "Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
- set(WITH_MKLML OFF CACHE STRING
- "Disable MKLML package when cross-compiling for Android and iOS" FORCE)
+ set(WITH_MKL OFF CACHE STRING
+ "Disable MKL when cross-compiling for Android and iOS" FORCE)
# Compile PaddlePaddle mobile inference library
if (NOT WITH_C_API)
@@ -111,6 +108,17 @@ else()
set(THIRD_PARTY_BUILD_TYPE Release)
endif()
+if(WITH_MKL)  # WITH_MKL is the user-facing option; the two internal flags below are derived from it
+ set(WITH_MKLML ON)  # MKLML is always turned on together with WITH_MKL
+ set(WITH_MKLDNN ${AVX2_FOUND})  # MKL-DNN follows AVX2_FOUND, so it stays off when AVX2 was not detected
+ if(NOT WITH_MKLDNN)
+ message(WARNING "Do not have AVX2 intrinsics and disabled MKL-DNN")
+ endif()
+else()  # WITH_MKL is off: force both internal flags off as well
+ set(WITH_MKLML OFF)
+ set(WITH_MKLDNN OFF)
+endif()
+
########################################################################################
include(external/mklml) # download mklml package
@@ -164,8 +172,12 @@ if(WITH_GPU)
endif(NOT WITH_DSO)
endif(WITH_GPU)
+if(WITH_MKLML)  # the Intel OpenMP library is now appended whenever MKLML is on, independent of MKL-DNN
+ list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
+endif()
+
if(WITH_MKLDNN)
- list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB})
+ list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
if(USE_NNPACK)
diff --git a/benchmark/paddle/image/run_mkldnn.sh b/benchmark/paddle/image/run_mkldnn.sh
index a4527e04968cf8c8c3c31d16f50bc3e28381f6d8..3cc779b48d082985f75ab1c053fbe262bc6d58aa 100755
--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
@@ -1,9 +1,7 @@
set -e
function train() {
- unset OMP_NUM_THREADS MKL_NUM_THREADS
- export OMP_DYNAMIC="FALSE"
- export KMP_AFFINITY="granularity=fine,compact,0,0"
+ unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
topology=$1
layer_num=$2
bs=$3
@@ -14,8 +12,6 @@ function train() {
elif [ $4 == "False" ]; then
thread=`nproc`
# each trainer_count use only 1 core to avoid conflict
- export OMP_NUM_THREADS=1
- export MKL_NUM_THREADS=1
log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
else
echo "Wrong input $3, use True or False."
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 24ddb24399dabeec9b8e5faf36be3eb21f420111..e550ec285668ea25757eeee9e7c5dc48fc9d339d 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -76,27 +76,14 @@ else()
include_directories(${CUDA_TOOLKIT_INCLUDE})
endif(NOT WITH_GPU)
-if(WITH_MKLDNN)
- add_definitions(-DPADDLE_USE_MKLDNN)
- if (WITH_MKLML AND MKLDNN_IOMP_DIR)
- message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}")
- set(OPENMP_FLAGS "-fopenmp")
- set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
- set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
- else()
- find_package(OpenMP)
- if(OPENMP_FOUND)
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
- else()
- message(WARNING "Can not find OpenMP."
- "Some performance features in MKLDNN may not be available")
- endif()
- endif()
-
-endif(WITH_MKLDNN)
+if (WITH_MKLML AND MKLML_IOMP_LIB)  # runs only when MKLML is enabled and MKLML_IOMP_LIB is set to a non-false value
+ message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
+ set(OPENMP_FLAGS "-fopenmp")
+ set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})  # NOTE(review): presumably keeps -fopenmp off the shared-library link line so the compiler's own OpenMP runtime is not linked -- confirm
+ set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")  # append -fopenmp to the global C compile flags
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")  # and to the global C++ compile flags
+endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake
index 310450f7d009dc0cdae9c0079a96445af8ec8f95..d3f5bf6852b3b295f3b5806b0577a880b0ce6ba6 100644
--- a/cmake/cross_compiling/ios.cmake
+++ b/cmake/cross_compiling/ios.cmake
@@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
# Set the architecture for iOS
if(NOT DEFINED IOS_ARCH)
if(IOS_PLATFORM STREQUAL "OS")
- # FIXME(liuyiqun): support "armv7;armv7s;arm64" future
- set(IOS_ARCH "arm64")
+ set(IOS_ARCH "armv7;armv7s;arm64")  # device default: a list of all three ARM architectures instead of arm64 only
elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
- # FIXME(liuyiqun): support "i386;x86_64" future
- set(IOS_ARCH "x86_64")
+ set(IOS_ARCH "i386;x86_64")  # simulator default: both x86 architectures instead of x86_64 only
endif()
endif()
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
@@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_
# Hidden visibilty is required for cxx on iOS
set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
-set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
+set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 5a06825beb73e85d8a55b7b578b187bee2c4340c..fc52d339d7a336b44c97f2e0a9fc8d6604854365 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -40,10 +40,9 @@ INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
SET(MKLDNN_DEPENDS ${MKLML_PROJECT})
- SET(MKLDNN_MKLROOT ${MKLML_ROOT})
- SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB})
- SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR})
- MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
+ MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")  # MKLML_ROOT is used directly; the MKLDNN_* alias variables are removed
+ELSE()  # any CBLAS provider other than MKLML aborts configuration
+ MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
ENDIF()
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
@@ -57,15 +56,16 @@ ExternalProject_Add(
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
- CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT}
+ CMAKE_ARGS -DMKLROOT=${MKLML_ROOT}
CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
- -DMKLROOT:PATH=${MKLDNN_MKLROOT}
+ -DMKLROOT:PATH=${MKLML_ROOT}
)
ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
-MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}")
+MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
+add_definitions(-DPADDLE_USE_MKLDNN)
LIST(APPEND external_project_dependencies mkldnn)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 324e29f931ecbb6beab2d363daa01a19b1a56b3e..4c4f59656dae68739f2f07f3febd510e727fe2dd 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -29,7 +29,7 @@ IF(NOT ${CBLAS_FOUND})
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE)
- SET(OPENBLAS_CC "${CMAKE_C_COMPILER}")
+ SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
IF(CMAKE_CROSSCOMPILING)
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
@@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND})
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
ENDIF()
ELSEIF(IOS)
- # FIXME(liuyiqun): support multiple architectures
- SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
- SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
- IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
- SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
- SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
- ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+ IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+ SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
+ SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
+ ELSE()
+ MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
+ "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
ENDIF()
ELSEIF(RPI)
# use hardfp
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 8bd058222880b4df3b08da09c02f9fe7f1d0ee66..a8e1aca49c97df256b1269c286b0bce7732fa932 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+IF(MOBILE_INFERENCE)  # when MOBILE_INFERENCE is set, leave this file before any warpctc setup below runs
+ return()
+ENDIF()
+
INCLUDE(ExternalProject)
SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 117ab7f49cdf4a568cd203b2b17767643d0b2d50..ad905ab55ba3537054fa5b30b5fca4d83c406702 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -115,8 +115,8 @@ function(link_paddle_exe TARGET_NAME)
target_link_libraries(${TARGET_NAME} log)
endif(ANDROID)
- if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR)
- target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+ if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB)
+ target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
endif()
add_dependencies(${TARGET_NAME} ${external_project_dependencies})
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 3bb52707974d633d7933f16592354455a5d451cc..d4d182f6692e09b3e40f3620b77d9a0f20ec5af3 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -335,6 +335,16 @@ bilinear_interp
.. autoclass:: paddle.v2.layer.bilinear_interp
:noindex:
+dot_prod
+---------
+.. autoclass:: paddle.v2.layer.dot_prod
+ :noindex:
+
+out_prod
+--------
+.. autoclass:: paddle.v2.layer.out_prod
+ :noindex:
+
power
-----
.. autoclass:: paddle.v2.layer.power
diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD
index 16236763a73770f3fe5eadf67645765d0456f875..ec6d4681836e189f46dbb9b915a237dc15cda7cf 100644
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
@@ -36,13 +36,13 @@ Figure 1. PaddlePaddle on IA.
我们把集成方案大致分为了如下几个方面。
### CMake
-我们会在`CMakeLists.txt`中会添加`WITH_MKLDNN`的选项,当设置这个值为`ON`的时候会启用编译MKL-DNN功能。同时会自动开启OpenMP用于提高MKL-DNN的性能。
+我们会在`CMakeLists.txt`中给用户添加一个`WITH_MKL`的开关,它是负责`WITH_MKLML`和`WITH_MKLDNN`的总开关。
-同时,我们会引入`WITH_MKLML`选项,用于选择是否使用MKL-DNN自带的MKLML安装包。这个安装包可以独立于MKL-DNN使用,但是建议在开启MKL-DNN的同时也打开MKLML的开关,这样才能发挥最好的性能。
+当打开`WITH_MKL`时,会开启MKLML的功能,作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 如果系统支持AVX2指令集及以上,同时会开启MKL-DNN功能。
-所以,我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中。
+当关闭`WITH_MKL`时,MKLML和MKL-DNN功能会同时关闭。
-**备注**:当`WITH_MKLML=ON`的时候,会优先使用这个包作为PaddlePaddle的CBLAS和LAPACK库,所以会稍微改动`cmake/cblas.cmake`中的逻辑。
+所以,我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中。
### Layers
所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst
index 731a63f945c29ba78538b3d71289b234e569354d..61f3a223547b352cf7929615cf3682b29b9a738f 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -34,7 +34,7 @@ PaddlePaddle的文档构建有两种方式。
cd TO_YOUR_PADDLE_CLONE_PATH
mkdir -p build
cd build
- cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
+ cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
make gen_proto_py
make paddle_docs paddle_docs_cn
diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md
index 882066f23714f7ab3bba9199b5fa5ff2325ce849..424d7718c64438496cf0895397babd5408e1ca02 100644
--- a/doc/mobile/cross_compiling_for_android_cn.md
+++ b/doc/mobile/cross_compiling_for_android_cn.md
@@ -1,4 +1,4 @@
-# 构建Android平台上的PaddlePaddle库
+# Android平台编译指南
用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库:
- 基于Docker容器的编译方式
diff --git a/doc/mobile/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md
index cda636a67de712e072f4cc7ad859dda75211eaa8..9da48e7f2119ce901fbb3abab73400df27be16d2 100644
--- a/doc/mobile/cross_compiling_for_ios_cn.md
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
@@ -1,4 +1,4 @@
-# 构建iOS平台上的PaddlePaddle库
+# iOS平台编译指南
交叉编译iOS平台上适用的PaddlePaddle库,需要在MacOS系统上进行。本文的将介绍在MacOS上,从源码交叉编译iOS平台上适用的PaddlePaddle库。
## 准备交叉编译环境
@@ -25,7 +25,7 @@ iOS平台可选配置参数:
- `IOS_PLATFORM`,可设置为`OS/SIMULATOR`,默认值为`OS`。
- `OS`,构建目标为`arm`架构的iPhone或者iPad等物理设备。
- `SIMULATOR`,构建目标为`x86`架构的模拟器平台。
-- `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示:
+- `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示,默认编译所有架构:
@@ -41,11 +41,11 @@ iOS平台可选配置参数:
OS |
- armv7, armv7s, arm64 (默认) |
+ armv7, armv7s, arm64 |
SIMULATOR |
- i386, x86_64 (默认) |
+ i386, x86_64 |
@@ -66,7 +66,7 @@ iOS平台可选配置参数:
```bash
cmake -DCMAKE_SYSTEM_NAME=iOS \
-DIOS_PLATFORM=OS \
- -DIOS_ARCH="arm64" \
+ -DIOS_ARCH="armv7;arm64" \
-DIOS_ENABLE_BITCODE=ON \
-DIOS_USE_VECLIB_FOR_BLAS=ON \
-DCMAKE_INSTALL_PREFIX=your/path/to/install \
@@ -112,6 +112,6 @@ $ make install
- `lib`目录,其中包含PaddlePaddle的C-API静态库
- `third_party`目录,其中包含所依赖的所有第三方库
-注意,不同架构的PaddlePaddle库建议安装到不同的目录下,然后使用`lipo`工具将多个静态库合并成一个支持多个架构的fat库。
+注意,如果PaddlePaddle库需要同时支持真机和模拟器,则需要分别编译真机和模拟器版本,然后使用`lipo`工具合并fat库。
自此,PaddlePaddle库已经安装完成,用户可将合成的fat库用于深度学习相关的iOS App中,调用方法见C-API文档。
diff --git a/doc/mobile/cross_compiling_for_raspberry_cn.md b/doc/mobile/cross_compiling_for_raspberry_cn.md
index 6e983645faaed1f67edaeeb82ddbef9cef6bb85f..f8ef9dc8031613831437745995268f3abc392f5b 100644
--- a/doc/mobile/cross_compiling_for_raspberry_cn.md
+++ b/doc/mobile/cross_compiling_for_raspberry_cn.md
@@ -1,4 +1,4 @@
-# 构建Raspberry Pi平台上的PaddlePaddle库
+# Raspberry Pi平台编译指南
通常有两个方法来构建基于 Rasspberry Pi 的版本:
diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp
index 78c43949dfe325d0e1a6ba10ae51cb7b858f6c52..bb8249a5511c089ec2f2263ff4cc290f0a5a8fce 100644
--- a/paddle/capi/Main.cpp
+++ b/paddle/capi/Main.cpp
@@ -29,6 +29,9 @@ static void initPaddle(int argc, char** argv) {
extern "C" {
paddle_error paddle_init(int argc, char** argv) {
+ static bool isInit = false;
+ if (isInit) return kPD_NO_ERROR;
+
std::vector realArgv;
realArgv.reserve(argc + 1);
realArgv.push_back(strdup(""));
@@ -37,6 +40,7 @@ paddle_error paddle_init(int argc, char** argv) {
}
initPaddle(argc + 1, realArgv.data());
free(realArgv[0]);
+ isInit = true;
return kPD_NO_ERROR;
}
}
diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h
index ede2670882ee2b93f610a2261a4ecc1784bc2d0c..4ab8de80d1c7be0f8e3eb848955373dd5e21bc18 100644
--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -25,7 +25,9 @@ limitations under the License. */
#include "hl_matrix.h"
#include "hl_sequence.h"
#include "hl_sparse.h"
+#ifndef PADDLE_MOBILE_INFERENCE
#include "hl_warpctc_wrap.h"
+#endif
#ifdef HPPL_STUB_FUNC
#include "stub/hl_aggregate_stub.h"
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index b3b9c45ded95ce2e735b8898d47760956dcacdce..00d9dd238ec5328be28f58f8118daad3a039e08c 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -270,6 +270,19 @@ static bool AllGradInSet(const std::vector& names,
return false;
}
}
+ if (VLOG_IS_ON(10)) {
+ std::ostringstream sout;
+ sout << "All input {";
+ for (auto& name : names) {
+ sout << name << ",";
+ }
+ sout << "} is in {";
+ for (auto& name : set) {
+ sout << name << ",";
+ }
+ sout << "}";
+ VLOG(10) << sout.str();
+ }
return true;
}
@@ -290,14 +303,12 @@ static void CreateGradVarInBlock(
auto ops = block_desc->AllOps();
for (size_t op_index = grad_op_start_index; op_index < ops.size();
++op_index) {
- bool need_infer_shape = false;
std::unordered_set new_vars;
ForEachVarName(ops[op_index]->Outputs(),
[&](const std::string& grad_var_name) {
if (block_desc->HasVar(grad_var_name)) {
return false;
}
- need_infer_shape = true;
auto var = block_desc->Var(grad_var_name);
new_vars.insert(var->Name());
auto it = param_name_map.find(grad_var_name);
@@ -311,23 +322,21 @@ static void CreateGradVarInBlock(
grad_record.op_idx_ = static_cast(op_index);
return false; /* not break */
});
- if (need_infer_shape) {
- ops[op_index]->InferVarType(block_desc);
- for (auto& arg : ops[op_index]->OutputArgumentNames()) {
- if (new_vars.find(arg) == new_vars.end()) {
- continue;
- }
- auto pname = FwdName(arg);
- auto* param = block_desc->FindVarRecursive(pname);
- auto* grad = block_desc->FindVar(arg);
- if (param == nullptr) {
- grad->SetDataType(DataType::FP32);
- } else {
- grad->SetDataType(param->GetDataType());
- }
+ ops[op_index]->InferVarType(block_desc);
+ for (auto& arg : ops[op_index]->OutputArgumentNames()) {
+ if (new_vars.find(arg) == new_vars.end()) {
+ continue;
+ }
+ auto pname = FwdName(arg);
+ auto* param = block_desc->FindVarRecursive(pname);
+ auto* grad = block_desc->FindVar(arg);
+ if (param == nullptr) {
+ grad->SetDataType(DataType::FP32);
+ } else {
+ grad->SetDataType(param->GetDataType());
}
- ops[op_index]->InferShape(*block_desc);
}
+ ops[op_index]->InferShape(*block_desc);
}
}
@@ -387,6 +396,7 @@ std::vector> MakeBlockBackward(
ProgramDescBind& program_desc, int block_idx,
std::unordered_set* no_grad_vars,
std::unordered_map* grad_to_var) {
+ VLOG(5) << "MakeBlockBackward";
BlockDescBind* cur_block = program_desc.MutableBlock(block_idx);
std::vector op_descs = cur_block->AllOps();
std::unordered_map> dup_out_ops;
@@ -394,9 +404,10 @@ std::vector> MakeBlockBackward(
std::vector> backward_descs;
for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
+ VLOG(5) << "Making backward " << (*it)->Type() << " op";
std::vector> op_grads;
- if ((*it)->Type() == "recurrent") {
+ if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
int step_block_idx = (*it)->GetBlockAttr("step_block");
BlockDescBind* backward_block = CreateStepBlock(
program_desc, no_grad_vars, grad_to_var, step_block_idx);
@@ -410,6 +421,15 @@ std::vector> MakeBlockBackward(
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
}
+ if (VLOG_IS_ON(10)) {
+ std::ostringstream sout;
+ sout << "Made ";
+ for (auto& op_grad : op_grads) {
+ sout << op_grad->Type() << " ";
+ }
+ VLOG(10) << sout.str();
+ }
+
for (const auto& desc : op_grads) {
for (const std::string& out_name : desc->OutputArgumentNames()) {
if (out_name.find("@GRAD") == std::string::npos) {
@@ -425,6 +445,8 @@ std::vector> MakeBlockBackward(
op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
[](std::unique_ptr& ptr) { return std::move(ptr); });
}
+
+ VLOG(5) << "Appending Sums";
// Check whether some variables are written more than once
std::list>> pending_sum_ops;
for (const auto& dup : dup_out_ops) {
@@ -432,16 +454,22 @@ std::vector> MakeBlockBackward(
const std::vector dup_op = dup.second;
if (out_name != kEmptyVarName && dup_op.size() > 1) {
std::vector sum_op_inputs;
+ std::string next_g_name = out_name;
for (size_t i = 0; i < dup_op.size(); ++i) {
+ VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name
+ << " duplicated";
std::string new_name = out_name + "@RENAME@" + std::to_string(i);
- backward_descs[dup_op[i]]->Rename(out_name, new_name);
+ backward_descs[dup_op[i]]->RenameOutput(out_name, new_name);
+ backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name);
sum_op_inputs.emplace_back(new_name);
+ next_g_name = sum_op_inputs.back();
}
std::unique_ptr sum_op(new OpDescBind(
"sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
}
}
+
pending_sum_ops.sort(
[](const std::pair>& a,
const std::pair>& b) {
@@ -452,6 +480,8 @@ std::vector> MakeBlockBackward(
std::move(p.second));
}
+ VLOG(5) << "MakeBlockBackward Finished";
+
return backward_descs;
}
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
index 3ec88d7a72c3339bf5e7d0ca3957a3f608f039b7..be144d8fc0104fccc08006532a85906ade25c2a1 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -29,6 +29,8 @@ inline DataType ToDataType(std::type_index type) {
return DataType::INT32;
} else if (typeid(int64_t).hash_code() == type.hash_code()) {
return DataType::INT64;
+ } else if (typeid(bool).hash_code() == type.hash_code()) {
+ return DataType::BOOL;
} else {
PADDLE_THROW("Not supported");
}
diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc
index 53b899a23997b71e723a298ec360a4e018d89878..8b6f42b82df14bfcd25f33ef16b5903fb965a8ba 100644
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -60,8 +60,7 @@ void make_ddim(DDim& ddim, const int64_t* dims, int n) {
ddim = make_dim<9>(dims);
break;
default:
- throw std::invalid_argument(
- "Dynamic dimensions must have between [1, 9] dimensions.");
+ PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions.");
}
}
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 2fcf41d69f0011b0d9a3d89c97fcebacb0703e97..adedd8cb0e8504fd6fc924e62a2ede3c1c7ce698 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -120,6 +120,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
+ VLOG(10) << op->DebugString();
op->Run(*local_scope, *device);
}
if (create_local_scope) {
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 39c8def82e1ebb10a0e357a648af760099020c32..48cd131550dea5ad3f368b25c31d753efbe0dff9 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -235,6 +235,23 @@ void OpDescBind::Rename(const std::string &old_name,
need_update_ = true;
}
+void OpDescBind::RenameOutput(const std::string &old_name,  // replaces old_name with new_name in outputs_ only
+ const std::string &new_name) {
+ for (auto &output : outputs_) {  // output.second is the argument-name list of one output slot
+ std::replace(output.second.begin(), output.second.end(), old_name,
+ new_name);
+ }
+ need_update_ = true;  // set unconditionally, even when no name matched
+}
+
+void OpDescBind::RenameInput(const std::string &old_name,  // replaces old_name with new_name in inputs_ only
+ const std::string &new_name) {
+ for (auto &input : inputs_) {  // input.second is the argument-name list of one input slot
+ std::replace(input.second.begin(), input.second.end(), old_name, new_name);
+ }
+ need_update_ = true;  // set unconditionally, even when no name matched
+}
+
struct SetAttrDescVisitor : public boost::static_visitor {
explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
mutable OpDesc::Attr *attr_;
@@ -448,7 +465,12 @@ const std::vector &CompileTimeInferShapeContext::Outputs(
DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
- return framework::make_ddim(var->Shape());
+ try {
+ return framework::make_ddim(var->Shape());
+ } catch (...) {
+ VLOG(5) << "GetDim of variable " << name << " error";  // record which variable failed before propagating
+ throw;  // rethrow the in-flight exception as-is; no <exception> header or exception_ptr round-trip needed
+ }
}
void CompileTimeInferShapeContext::SetDim(const std::string &name,
diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index e3e96441bbf51729f2ba69c9257e6961b1de0d5c..da032319afa775571d3942bf6ae415db7d233735 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -73,6 +73,10 @@ class OpDescBind {
void Rename(const std::string &old_name, const std::string &new_name);
+ void RenameOutput(const std::string &old_name, const std::string &new_name);
+
+ void RenameInput(const std::string &old_name, const std::string &new_name);
+
// Only be used in C++
const AttributeMap &GetAttrMap() const;
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 3276f8af396fe58450a8dc6713fe61e49d5ca708..93467ab8ac796277b47a861a427de2837fb2d3d4 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -403,19 +403,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
void OperatorWithKernel::Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const {
- if (VLOG_IS_ON(1)) {
- auto inputs = this->InputVars();
- auto outputs = this->OutputVars(true);
- std::ostringstream sout;
- sout << "Run operator " << this->Type() << " From [";
- std::ostream_iterator out_it(sout, ",");
- std::copy(inputs.begin(), inputs.end(), out_it);
- sout << "] to [";
- std::copy(outputs.begin(), outputs.end(), out_it);
- sout << "]";
- VLOG(1) << sout.str();
- }
-
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx);
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index 9428b8a07ea0af005f6e960ddaa02da624ad9d97..9ad6272c99dd6a85520ae44c1331ac232bc6a9a2 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -38,11 +38,12 @@ Scope& Scope::NewScope() const {
Variable* Scope::Var(const std::string& name) {
auto iter = vars_.find(name);
if (iter != vars_.end()) {
+ VLOG(3) << "Get existing variable " << name;
return iter->second;
}
Variable* v = new Variable();
vars_[name] = v;
- VLOG(3) << "Create variable " << name << " on scope";
+ VLOG(3) << "Create variable " << name;
v->name_ = &(vars_.find(name)->first);
return v;
}
diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h
index 7d36ead2ca85328c7843b3b5d423cf8e921d1c93..05dc47f06ac81f0acb6d0317cbecb3009c7dd7f0 100644
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -53,6 +53,10 @@ class InferShapeContext {
virtual bool IsRuntime() const = 0;
+ // Note: In while op, we need this to be public
+ void SetDims(const std::vector &names,
+ const std::vector &dims);
+
protected:
virtual framework::DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
@@ -60,9 +64,6 @@ class InferShapeContext {
std::vector GetDims(
const std::vector &names) const;
- void SetDims(const std::vector &names,
- const std::vector &dims);
-
std::vector GetVarTypes(
const std::vector &names) const;
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 91d732641a4a5eed050841b59fd10da397eb732f..41ead3c5ecef248830cfb0f8be360f21dcd58e7b 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -73,7 +73,6 @@ if(MOBILE_INFERENCE)
list(REMOVE_ITEM GSERVER_SOURCES
dataproviders/DataProvider.cpp
dataproviders/MultiDataProvider.cpp
- dataproviders/ProtoDataProvider.cpp
dataproviders/PyDataProvider2.cpp
dataproviders/PyDataProvider.cpp)
diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp
index 0478256f9cd81f4a99eb0cbcbd1a5a21de5cf14b..106cf5b6228e636026ded558d0f591022f1ae586 100644
--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -16,8 +16,8 @@ limitations under the License. */
#include
#include
-#include "ProtoDataProvider.h"
#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
@@ -164,8 +164,6 @@ DataProvider* DataProvider::create(const DataConfig& config,
REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
REGISTER_DATA_PROVIDER(dummy, DummyDataProvider);
-REGISTER_DATA_PROVIDER(proto, ProtoDataProvider);
-REGISTER_DATA_PROVIDER(proto_sequence, ProtoSequenceDataProvider);
int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch)
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
deleted file mode 100644
index c6f5cab1915b7f41d505c37a7fef762a392bad7f..0000000000000000000000000000000000000000
--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ /dev/null
@@ -1,932 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ProtoDataProvider.h"
-#include
-#include
-#include
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Util.h"
-
-#include "DataProviderGroup.h"
-#include "paddle/utils/Logging.h"
-
-DEFINE_double(memory_threshold_on_load_data,
- 1.0,
- "stop loading data when memory is not sufficient");
-
-namespace paddle {
-
-REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup);
-REGISTER_DATA_PROVIDER(proto_sequence_group,
- DataProviderGroup);
-
-ProtoDataProvider::ProtoDataProvider(const DataConfig& config,
- bool useGpu,
- bool loadDataAll)
- : DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) {
- if (loadDataAll) {
- loadData(config_.files());
- }
-}
-
-void ProtoDataProvider::loadData(const std::vector& fileList) {
- for (auto& file : fileList) {
- if (FLAGS_memory_threshold_on_load_data < 1.0) {
- double memUsage = getMemoryUsage();
- if (memUsage > FLAGS_memory_threshold_on_load_data) {
- LOG(INFO) << "memUsage is " << memUsage << ", > "
- << FLAGS_memory_threshold_on_load_data
- << " therefore SKIP ALL REMAINING file.";
- break;
- }
- }
- LOG(INFO) << "load data file " << file;
- loadDataFile(file);
- }
-
- if (sequenceStartPositions_.size() == sampleNums_) {
- // This means that each sample is one sequence
- shuffledSequenceIds_.swap(sequenceStartPositions_);
- } else {
- sequenceStartPositions_.push_back(sampleNums_);
- shuffledSequenceIds_.reserve(sequenceStartPositions_.size() - 1);
- for (size_t i = 0; i < sequenceStartPositions_.size() - 1; ++i) {
- shuffledSequenceIds_.push_back(i);
- }
- }
-
- LOG(INFO) << "read done, num of instance=" << sampleNums_;
- showDataStats();
-}
-
-void ProtoDataProvider::loadData(const std::string& fileName) {
- std::vector fileList;
- loadFileList(fileName, fileList);
- loadData(fileList);
-}
-
-void ProtoDataProvider::checkDataHeader(const DataHeader& header) {
- if (header_.slot_defs_size()) {
- // header_ is already set. Need to check consistency.
- CHECK_EQ(header_.slot_defs_size(), header.slot_defs_size())
- << "Different header";
- for (int i = 0; i < header.slot_defs_size(); ++i) {
- CHECK_EQ(header_.slot_defs(i).type(), header.slot_defs(i).type());
- CHECK_EQ(header_.slot_defs(i).dim(), header.slot_defs(i).dim());
- }
- return;
- }
-
- // header_ is not set before
- CHECK(header.slot_defs_size()) << "Invalid header: no slot is defined";
- int i;
- for (i = 0; i < header.slot_defs_size(); ++i) {
- if (header.slot_defs(i).type() == SlotDef::INDEX ||
- header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX) {
- break;
- }
- constexpr int kBufLen = 100;
- char buf[kBufLen];
- snprintf(buf, kBufLen, "slot%d_nnz", i);
- nnzStats_.push_back(getStat(buf));
- }
- numVecSlots_ = i;
-
- // Check that INDEX slots are after VECTOR slots
- for (int i = numVecSlots_; i < header.slot_defs_size(); ++i) {
- CHECK(header.slot_defs(i).type() == SlotDef::INDEX ||
- header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX);
- }
-
- slots_.clear();
- slots_.reserve(header.slot_defs_size());
- for (int i = 0; i < header.slot_defs_size(); ++i) {
- slots_.emplace_back();
- slots_.back().type = header.slot_defs(i).type();
- slots_.back().dim = header.slot_defs(i).dim();
- if (SlotDef::VECTOR_SPARSE_NON_VALUE == header.slot_defs(i).type() ||
- SlotDef::VECTOR_SPARSE_VALUE == header.slot_defs(i).type()) {
- slots_.back().indices.push_back(0);
- }
- }
-
- header_ = header;
-}
-
-void ProtoDataProvider::checkSample(const DataSample& sample) {
- CHECK_EQ(numVecSlots_, sample.vector_slots_size());
- CHECK(header_.slot_defs_size() == numVecSlots_ + sample.id_slots_size() ||
- header_.slot_defs_size() == numVecSlots_ + sample.var_id_slots_size());
- for (int i = 0; i < numVecSlots_; ++i) {
- uint32_t dim = header_.slot_defs(i).dim();
- switch (header_.slot_defs(i).type()) {
- case SlotDef::VECTOR_DENSE: {
- CHECK_EQ(static_cast(dim), sample.vector_slots(i).values_size());
- CHECK_EQ(0, sample.vector_slots(i).ids_size());
- break;
- }
- case SlotDef::VECTOR_SPARSE_NON_VALUE: {
- if (0 == sample.vector_slots(i).ids_size()) {
- break;
- }
- CHECK_LT(0, sample.vector_slots(i).ids_size());
- CHECK_EQ(0, sample.vector_slots(i).values_size());
- auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(),
- sample.vector_slots(i).ids().end());
- CHECK_GT(dim, maxId);
- break;
- }
- case SlotDef::VECTOR_SPARSE_VALUE: {
- if (0 == sample.vector_slots(i).ids_size()) {
- CHECK_EQ(0, sample.vector_slots(i).values_size());
- break;
- }
- CHECK_LT(0, sample.vector_slots(i).values_size());
- CHECK_GE(static_cast(dim), sample.vector_slots(i).values_size());
- CHECK_EQ(sample.vector_slots(i).values_size(),
- sample.vector_slots(i).ids_size());
- auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(),
- sample.vector_slots(i).ids().end());
- CHECK_GT(dim, maxId);
- break;
- }
- case SlotDef::VAR_MDIM_DENSE: {
- if (static_cast(dim) != 0) {
- CHECK_EQ(static_cast(dim), sample.vector_slots(i).values_size());
- if (sample.vector_slots(i).dims_size() != 0) {
- int totalDim = sample.vector_slots(i).dims(0);
- for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) {
- totalDim *= sample.vector_slots(i).dims(j);
- }
- CHECK_EQ(static_cast(dim), totalDim);
- }
- } else {
- CHECK_NE(sample.vector_slots(i).dims_size(), 0);
- int totalDim = sample.vector_slots(i).dims(0);
- for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) {
- totalDim *= sample.vector_slots(i).dims(j);
- }
- CHECK_EQ(totalDim, sample.vector_slots(i).values_size());
- }
- break;
- }
- case SlotDef::STRING: {
- CHECK_EQ(static_cast(1), sample.vector_slots(i).strs_size());
- CHECK_EQ(0, sample.vector_slots(i).ids_size());
- CHECK_EQ(0, sample.vector_slots(i).values_size());
- break;
- }
- default:
- LOG(FATAL) << "BUG: Should not reach here";
- }
- }
- for (int i = numVecSlots_; i < header_.slot_defs_size(); ++i) {
- if (header_.slot_defs(i).type() != SlotDef::VAR_MDIM_INDEX) {
- uint32_t id = sample.id_slots(i - numVecSlots_);
- if (id == -1U) continue;
- CHECK_LT(id, header_.slot_defs(i).dim());
- } else {
- for (int j = 0; j < sample.var_id_slots(i - numVecSlots_).ids_size();
- ++j) {
- uint32_t id = sample.var_id_slots(i - numVecSlots_).ids(j);
- CHECK_LT(id, header_.slot_defs(i).dim());
- }
- }
- }
-}
-
-void ProtoDataProvider::loadDataFile(const std::string& fileName) {
- std::ifstream is(fileName);
- CHECK(is) << "Fail to open " << fileName;
- bool dataCompression = str::endsWith(fileName, ".gz");
- std::unique_ptr reader(new ProtoReader(&is, dataCompression));
- CHECK(reader) << "Fail to create proto data input stream";
-
- DataHeader header;
- CHECK(reader->read(&header));
- checkDataHeader(header);
-
- DataSample sample;
- do {
- if (!reader->read(&sample)) {
- break;
- }
- checkSample(sample);
- if (sample.is_beginning()) {
- sequenceStartPositions_.push_back(sampleNums_);
- }
- fillSlots(sample);
- ++sampleNums_;
- } while (true);
-
- CHECK(is.eof()) << "Fail to read file";
- reader.reset(nullptr);
- is.close();
-}
-
-// checkSample has done before, no check here
-void ProtoDataProvider::fillSlots(const DataSample& sample) {
- for (size_t i = 0; i < slots_.size(); ++i) {
- auto& slot = slots_[i];
- int dim = slot.dim;
- switch (slot.type) {
- case SlotDef::VECTOR_DENSE: {
- size_t oldSize = slot.denseData.size();
- slot.denseData.resize(oldSize + dim);
- const float* values = sample.vector_slots(i).values().data();
-#ifdef PADDLE_TYPE_DOUBLE
- std::copy(values, values + dim, slot.denseData.begin() + oldSize);
-#else
- memcpy(slot.denseData.data() + oldSize, values, sizeof(real) * dim);
-#endif
- break;
- }
- case SlotDef::VECTOR_SPARSE_NON_VALUE: {
- int slotSize = sample.vector_slots(i).ids_size();
- int subSlotSize = 0;
- int id = 0; // the slot id
- // find whether this vector_slots has subseq. If not has subseq,
- // subSlotSize = 0.
- for (id = 0; id < sample.subseq_slots_size(); id++) {
- if (sample.subseq_slots(id).slot_id() == i) {
- subSlotSize = sample.subseq_slots(id).lens_size();
- break;
- }
- }
- if (subSlotSize && slot.subIndices.size() == 0UL) {
- // If has subSeq, the first element of subIndices = 0.
- slot.subIndices.push_back(0);
- }
- if (slotSize == 0UL) {
- // if has no id, new indices = old indices.
- slot.indices.push_back(slot.indices.back());
- // if has subSeq, new subIndices = old subIndices.
- if (slot.subIndices.size()) {
- slot.subIndices.push_back(slot.subIndices.back());
- }
- break;
- }
- slot.sparseNonValueData.resize(slot.indices.back() + slotSize);
- const unsigned int* ids = sample.vector_slots(i).ids().data();
- memcpy(slot.sparseNonValueData.data() + slot.indices.back(),
- ids,
- sizeof(*ids) * slotSize);
- slot.indices.push_back(slot.indices.back() + slotSize);
- if (subSlotSize) {
- for (int ii = 0; ii < subSlotSize; ++ii) {
- slot.subIndices.push_back(slot.subIndices.back() +
- sample.subseq_slots(id).lens(ii));
- }
- }
- break;
- }
- case SlotDef::VECTOR_SPARSE_VALUE: {
- if (0 == sample.vector_slots(i).ids_size()) {
- slot.indices.push_back(slot.indices.back());
- break;
- }
- int slotSize = sample.vector_slots(i).ids_size();
- slot.sparseFloatValueData.resize(slot.indices.back() + slotSize);
- const unsigned int* ids = sample.vector_slots(i).ids().data();
- const float* values = sample.vector_slots(i).values().data();
- for (int ii = 0; ii < slotSize; ++ii) {
- slot.sparseFloatValueData[slot.indices.back() + ii].col = ids[ii];
- slot.sparseFloatValueData[slot.indices.back() + ii].value =
- values[ii];
- }
- slot.indices.push_back(slot.indices.back() + slotSize);
- break;
- }
- case SlotDef::INDEX: {
- slot.indexData.push_back(sample.id_slots(i - numVecSlots_));
- break;
- }
- case SlotDef::VAR_MDIM_DENSE: {
- size_t oldSize = slot.varDenseData.size();
- slot.varDenseData.resize(oldSize + 1);
- size_t varDim = sample.vector_slots(i).values_size();
- slot.varDenseData[oldSize].data.resize(varDim);
- const float* values = sample.vector_slots(i).values().data();
-#ifdef PADDLE_TYPE_DOUBLE
- std::copy(
- values, values + varDim, slot.varDenseData[oldSize].data.data());
-#else
- memcpy(slot.varDenseData[oldSize].data.data(),
- values,
- sizeof(real) * varDim);
-#endif
- slot.varDenseData[oldSize].dims.resize(
- sample.vector_slots(i).dims_size());
- memcpy(slot.varDenseData[oldSize].dims.data(),
- sample.vector_slots(i).dims().data(),
- sizeof(uint32_t) * sample.vector_slots(i).dims_size());
- break;
- }
- case SlotDef::VAR_MDIM_INDEX: {
- size_t oldSize = slot.varIndices.size();
- slot.varIndices.resize(oldSize + 1);
- size_t varDim = sample.var_id_slots(i - numVecSlots_).ids_size();
- slot.varIndices[oldSize].resize(varDim);
- memcpy(slot.varIndices[oldSize].data(),
- sample.var_id_slots(i - numVecSlots_).ids().data(),
- sizeof(uint32_t) * varDim);
- break;
- }
- case SlotDef::STRING: {
- slot.strData.push_back(sample.vector_slots(i).strs(0));
- break;
- }
- }
- }
-}
-
-void ProtoDataProvider::showDataStats() {
- std::ostringstream oss;
- for (size_t i = 0; i < slots_.size(); ++i) {
- auto& slot = slots_[i];
- if (slot.type == SlotDef::VECTOR_SPARSE_NON_VALUE) {
- size_t nnz = slot.sparseNonValueData.size();
- oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; ";
- } else if (slot.type == SlotDef::VECTOR_SPARSE_VALUE) {
- size_t nnz = slot.sparseFloatValueData.size();
- oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; ";
- }
- }
- LOG(INFO) << oss.str();
-}
-
-void ProtoDataProvider::reset() {
- currentSequenceIndex_ = 0;
- if (!skipShuffle_) {
- shuffle();
- }
-
- DataProvider::reset();
-}
-
-void ProtoDataProvider::shuffle() {
- std::shuffle(shuffledSequenceIds_.begin(),
- shuffledSequenceIds_.end(),
- ThreadLocalRandomEngine::get());
-}
-
-/*
- Loop through sequences starting from currentSequenceIndex_
- for at most size samples. For each sequence ranging from [begin, end),
- op(begin, end) will be called.
-
- return the number of sequences scanned
-*/
-template
-int64_t ProtoDataProvider::sequenceLoop(Op op, int64_t size) {
- int64_t sz = 0;
- size_t i;
- size_t sequenceCount = shuffledSequenceIds_.size();
- if (usageRatio_ < 1.0f) {
- sequenceCount = static_cast(sequenceCount * usageRatio_);
- }
- for (i = currentSequenceIndex_; i < sequenceCount; ++i) {
- size_t id = shuffledSequenceIds_[i];
- int64_t begin = sequenceStartPositions_[id];
- int64_t end = sequenceStartPositions_[id + 1];
- int64_t len = end - begin;
- if (sz + len > size && sz > 0) break;
- sz += len;
- op(begin, end);
- }
- return i - currentSequenceIndex_;
-}
-
-/*
- Loop through sequences starting from currentSequenceIndex_
- for at most size samples. For each sample of each sequence at position
- pos, op(pos) will be called.
-
- return the number of sequences scanned
-*/
-template
-int64_t ProtoDataProvider::sampleLoop(Op op, int64_t size) {
- if (iidData()) {
- size = std::min(sampleNums_ - currentSequenceIndex_, size);
- for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size;
- ++i) {
- size_t pos = shuffledSequenceIds_[i];
- op(pos);
- }
- return size;
- } else {
- auto f = [op](int64_t begin, int64_t end) {
- for (int64_t pos = begin; pos < end; ++pos) {
- op(pos);
- }
- };
- return sequenceLoop(f, size);
- }
-}
-
-/*
- Loop through sub-sequences starting from currentSequenceIndex_
- for at most size samples. For each sample of each sub-sequence at position
- pos, op(pos) will be called.
-
- return the number of sub-sequences scanned
-*/
-template
-int64_t ProtoDataProvider::subSampleLoop(Op op, int64_t size, int slot) {
- CHECK(iidData()) << "subSampleLoop only accepts iid data";
- size = std::min(sampleNums_ - currentSequenceIndex_, size);
- int subSize = 0;
- for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size;
- ++i) {
- size_t pos = shuffledSequenceIds_[i];
- int64_t* indexs = slots_[slot].indices.data();
- int64_t* subIndexs = slots_[slot].subIndices.data();
- int64_t subSeqStart = 0;
- int64_t subSeqEnd = 0;
- for (int j = 0; j < (int)slots_[slot].subIndices.size(); j++) {
- if (subIndexs[j] == indexs[pos]) {
- subSeqStart = j;
- if (subIndexs[pos] == subIndexs[pos + 1]) {
- subSeqEnd = j + 1;
- break;
- }
- } else if (subIndexs[j] == indexs[pos + 1]) {
- subSeqEnd = j;
- break;
- }
- }
- for (int j = subSeqStart; j < subSeqEnd; j++) {
- op(j);
- }
- subSize += subSeqEnd - subSeqStart;
- }
- return subSize;
-}
-
-int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
- DataBatch* batch) {
- int64_t numSequences = 0; // actual number of sequences in the batch
-
- // the number of sequences scanned, including those skipped because too long
- int64_t numScannedSeqs = 0;
- std::lock_guard guard(lock_);
- if (iidData()) {
- size = std::min(getSize() - currentSequenceIndex_, size);
- numScannedSeqs = numSequences = size;
- } else {
- int64_t sz = 0;
- auto op = [&sz, &numSequences](int64_t begin, int64_t end) {
- ++numSequences;
- sz += end - begin;
- };
- numScannedSeqs = sequenceLoop(op, size);
- VLOG_IF(1, numScannedSeqs > numSequences)
- << numScannedSeqs - numSequences
- << " sequences are skipped because longer than " << size;
- size = sz;
- }
- if (size <= 0) return 0;
-
- DataBatch& cpuBatch = *cpuBatch_;
- std::vector& cpuArguments = cpuBatch.getStreams();
- cpuBatch.setSize(size);
- cpuArguments.resize(header_.slot_defs_size());
-
- if (!iidData()) {
- ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions,
- numSequences + 1,
- /* useGpu= */ false);
- int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false);
- int pos = 0;
- int i = 0;
- auto op = [buf, &pos, &i](int64_t begin, int64_t end) {
- buf[i] = pos;
- pos += end - begin;
- ++i;
- };
- sequenceLoop(op, size);
- buf[i] = size;
- for (size_t slot = 1; slot < cpuArguments.size(); ++slot) {
- cpuArguments[slot].sequenceStartPositions =
- cpuArguments[0].sequenceStartPositions;
- }
- }
-
- for (int slot = 0; slot < header_.slot_defs_size(); ++slot) {
- size_t dim = header_.slot_defs(slot).dim();
- SlotDef::SlotType slotType = header_.slot_defs(slot).type();
-
- std::vector dataPos;
- dataPos.reserve(size);
- auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); };
- sampleLoop(op, size);
-
- switch (slotType) {
- case SlotDef::VECTOR_DENSE: {
- Matrix::resizeOrCreate(cpuArguments[slot].value,
- size,
- dim,
- false, // trans = false
- false); // useGpu = false
- real* buf = cpuArguments[slot].value->getData();
- for (int i = 0; i < size; ++i) {
- memcpy(buf + i * dim,
- slots_[slot].denseData.data() + dataPos[i] * dim,
- sizeof(real) * dim);
- }
- break;
- }
- case SlotDef::VECTOR_SPARSE_NON_VALUE: {
- if (!(cpuArguments[slot].value)) {
- cpuArguments[slot].value =
- Matrix::createSparseMatrix(size,
- dim,
- size /*DEFAULT_AVG_WIDTH = 1*/,
- NO_VALUE,
- SPARSE_CSR,
- false,
- useGpu_);
- }
- auto mat = cpuArguments[slot].value;
- mat->resize(size, dim);
- if (std::dynamic_pointer_cast(mat)) {
- std::dynamic_pointer_cast(mat)->copyFrom(
- dataPos.data(),
- slots_[slot].indices.data(),
- slots_[slot].sparseNonValueData.data(),
- HPPL_STREAM_1);
- } else if (std::dynamic_pointer_cast(mat)) {
- std::dynamic_pointer_cast(mat)->copyFrom(
- dataPos.data(),
- slots_[slot].indices.data(),
- slots_[slot].sparseNonValueData.data());
- } else {
- LOG(FATAL) << "Not Supported";
- }
- size_t numElements = 0;
- for (auto pos : dataPos) {
- numElements +=
- slots_[slot].indices[pos + 1] - slots_[slot].indices[pos];
- }
- nnzStats_[slot]->addSample(numElements);
-
- break;
- }
- case SlotDef::VECTOR_SPARSE_VALUE: {
- if (!(cpuArguments[slot].value)) {
- cpuArguments[slot].value =
- Matrix::createSparseMatrix(size,
- dim,
- size /*DEFAULT_AVG_WIDTH = 1*/,
- FLOAT_VALUE,
- SPARSE_CSR,
- false,
- useGpu_);
- }
- auto mat = cpuArguments[slot].value;
- mat->resize(size, dim);
- if (std::dynamic_pointer_cast(mat)) {
- std::dynamic_pointer_cast(mat)->copyFrom(
- dataPos.data(),
- slots_[slot].indices.data(),
- slots_[slot].sparseFloatValueData.data(),
- HPPL_STREAM_1);
- } else if (std::dynamic_pointer_cast(mat)) {
- std::dynamic_pointer_cast(mat)->copyFrom(
- dataPos.data(),
- slots_[slot].indices.data(),
- slots_[slot].sparseFloatValueData.data());
- } else {
- LOG(FATAL) << "Not Supported";
- }
- break;
- }
- case SlotDef::INDEX: {
- IVector::resizeOrCreate(cpuArguments[slot].ids,
- size,
- /* useGpu= */ false);
- int* buf = cpuArguments[slot].ids->getData();
- for (int i = 0; i < size; ++i) {
- buf[i] = slots_[slot].indexData[dataPos[i]];
- }
- break;
- }
- case SlotDef::VAR_MDIM_DENSE: {
- CHECK_EQ(size, 1);
- auto mat = cpuArguments[slot].value;
- size_t totalDim = slots_[slot].varDenseData[dataPos[0]].data.size();
-
- CHECK_EQ(slots_[slot].varDenseData[dataPos[0]].dims.size(), size_t(3));
- size_t height, width, depth, oldWidth;
- /* dims[2] is depth, will be changed to dims[0] in future */
- depth = slots_[slot].varDenseData[dataPos[0]].dims[2];
- height = slots_[slot].varDenseData[dataPos[0]].dims[1];
- width = slots_[slot].varDenseData[dataPos[0]].dims[0];
- oldWidth = width;
- /* process the undesirable sample */
- if (oldWidth < height) {
- width = height;
- }
- cpuArguments[slot].setFrameHeight(height);
- cpuArguments[slot].setFrameWidth(width);
-
- if (oldWidth < height) {
- totalDim = width * height * depth;
- }
- Matrix::resizeOrCreate(cpuArguments[slot].value,
- size,
- totalDim,
- false, // trans = false
- false); // useGpu = false
- real* buf = cpuArguments[slot].value->getData();
- cpuArguments[slot].value->zeroMem();
- if (oldWidth < height) {
- real* srcBuf = slots_[slot].varDenseData[dataPos[0]].data.data();
- for (size_t i = 0; i < depth; i++) {
- for (size_t j = 0; j < height; j++) {
- for (size_t k = 0; k < oldWidth; k++) {
- buf[i * height * width + j * width + k] =
- srcBuf[i * height * oldWidth + j * oldWidth + k];
- }
- }
- }
- } else {
- memcpy(buf,
- slots_[slot].varDenseData[dataPos[0]].data.data(),
- sizeof(real) * totalDim);
- }
- ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
- size + 1, /* size == 1 currently */
- /* useGpu= */ false);
- int* bufStarts =
- cpuArguments[slot].sequenceStartPositions->getMutableData(false);
- bufStarts[0] = 0;
- bufStarts[1] = 1;
- break;
- }
- case SlotDef::VAR_MDIM_INDEX: {
- CHECK_EQ(size, 1);
- size_t totalDim = slots_[slot].varIndices[dataPos[0]].size();
- IVector::resizeOrCreate(cpuArguments[slot].ids,
- totalDim,
- /* useGpu= */ false);
- int* buf = cpuArguments[slot].ids->getData();
- memcpy(buf,
- slots_[slot].varIndices[dataPos[0]].data(),
- sizeof(int) * totalDim);
-
- ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
- size + 1, /* size == 1 currently */
- /* useGpu= */ false);
- int* bufStarts =
- cpuArguments[slot].sequenceStartPositions->getMutableData(false);
- bufStarts[0] = 0;
- /* we expand the convolutinal feature map to a sequence data,
- * so there should be a corresponding sequence labels */
- bufStarts[1] = totalDim;
- break;
- }
- case SlotDef::STRING: {
- if (cpuArguments[slot].strs) {
- cpuArguments[slot].strs->resize(size);
- } else {
- cpuArguments[slot].strs =
- std::make_shared>(size);
- }
- for (int i = 0; i < size; ++i) {
- (*cpuArguments[slot].strs)[i] = slots_[slot].strData[dataPos[i]];
- }
- break;
- }
- }
- }
-
- if (useGpu_) {
- std::vector& cpuArguments = cpuBatch.getStreams();
- DataBatch& gpuBatch = *gpuBatch_;
- std::vector& gpuArguments = gpuBatch.getStreams();
- gpuArguments.resize(cpuArguments.size());
- gpuBatch.setSize(size);
- for (int i = 0; i < header_.slot_defs_size(); ++i) {
- SlotDef::SlotType slotType = header_.slot_defs(i).type();
- if (SlotDef::VECTOR_SPARSE_VALUE == slotType ||
- SlotDef::VECTOR_SPARSE_NON_VALUE == slotType) {
- gpuArguments[i] = cpuArguments[i];
- gpuArguments[i].sequenceStartPositions =
- cpuArguments[i].sequenceStartPositions;
- } else {
- gpuArguments[i].resizeAndCopyFrom(
- cpuArguments[i], useGpu_, HPPL_STREAM_1);
- }
- }
- hl_stream_synchronize(HPPL_STREAM_1);
- *batch = gpuBatch;
- } else {
- *batch = cpuBatch;
- }
-
- currentSequenceIndex_ += numScannedSeqs;
-
- return batch->getSize();
-}
-
-ProtoSequenceDataProvider::ProtoSequenceDataProvider(const DataConfig& config,
- bool useGpu,
- bool loadDataAll)
- : ProtoDataProvider(config, useGpu, loadDataAll) {}
-
-int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
- DataBatch* batch) {
- CHECK(iidData()) << "ProtoSequenceDataProvider only accepts iid data";
- int64_t numSequences = 0; // actual number of sequences in the batch
-
- // the number of sequences scanned, including those skipped because too long
- int64_t numScannedSeqs = 0;
- std::lock_guard guard(lock_);
- size = std::min(getSize() - currentSequenceIndex_, size);
- numScannedSeqs = numSequences = size;
- if (size <= 0) return 0;
-
- DataBatch& cpuBatch = *cpuBatch_;
- std::vector& cpuArguments = cpuBatch.getStreams();
- cpuBatch.setSize(size);
- cpuArguments.resize(header_.slot_defs_size());
-
- for (int slot = 0; slot < header_.slot_defs_size(); ++slot) {
- SlotDef::SlotType slotType = header_.slot_defs(slot).type();
-
- std::vector dataPos;
- dataPos.reserve(size);
- auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); };
- sampleLoop(op, size);
-
- // current slot: sequenceStartPositions
- ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
- size + 1,
- /* useGpu= */ false);
-
- switch (slotType) {
- case SlotDef::VECTOR_SPARSE_VALUE:
- case SlotDef::VAR_MDIM_DENSE:
- case SlotDef::VAR_MDIM_INDEX: {
- LOG(FATAL) << "ProtoSequenceDataProvider only support"
- << " VECTOR_DENSE, VECTOR_SPARSE_NON_VALUE and INDEX slots";
- break;
- }
- case SlotDef::VECTOR_SPARSE_NON_VALUE: {
- // copy to IDS, not value
- // pointers used in current slot
- sparse_non_value_t* data = slots_[slot].sparseNonValueData.data();
- int64_t* indexs = slots_[slot].indices.data();
- int64_t* seqs = dataPos.data();
-
- // current slot: i need size instances. what is the total length?
- int totalFeatureInCurrentSlot = 0;
- for (int ins = 0; ins < size; ins++) {
- int64_t currInsId = seqs[ins];
- totalFeatureInCurrentSlot +=
- indexs[currInsId + 1] - indexs[currInsId];
- // special: if current instance has NO feature in current slot
- if (indexs[currInsId + 1] == indexs[currInsId]) {
- totalFeatureInCurrentSlot++;
- }
- }
- // done
-
- // current slot: ids
- IVector::resizeOrCreate(cpuArguments[slot].ids,
- totalFeatureInCurrentSlot,
- /* useGpu= */ false);
-
- // where to write
- int* currPosOfArgumentId = cpuArguments[slot].ids->getData();
- int* currPosOfArgumentSeqStart =
- cpuArguments[slot].sequenceStartPositions->getMutableData(false);
- int allSequenceLength = 0;
- currPosOfArgumentSeqStart[0] = 0;
- // for each instance, copy data and fill sequence positions
- for (int instance = 0; instance < size; instance++) {
- int64_t currInstanceId = seqs[instance];
- int64_t currInstanceLength =
- indexs[currInstanceId + 1] - indexs[currInstanceId];
- sparse_non_value_t* currInstanceData = data + indexs[currInstanceId];
- // write sequenceStartPositions
- allSequenceLength += currInstanceLength;
- currPosOfArgumentSeqStart[instance + 1] = allSequenceLength;
- // copy features
- for (int featCopier = 0; featCopier < currInstanceLength;
- featCopier++) {
- currPosOfArgumentId[featCopier] = currInstanceData[featCopier].col;
- }
- currPosOfArgumentId += currInstanceLength;
- // special: if current instance has NO feature in current slot
- if (currInstanceLength == 0) {
- allSequenceLength++;
- currPosOfArgumentSeqStart[instance + 1] = allSequenceLength;
- currPosOfArgumentId[0] = -1;
- currPosOfArgumentId++;
- }
- // done
- }
- if (slots_[slot].subIndices.size()) {
- std::vector dataSubPos;
- auto op = [this, &dataSubPos](int64_t pos) {
- dataSubPos.push_back(pos);
- };
- int subSize = subSampleLoop(op, size, slot);
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].subSequenceStartPositions, subSize + 1, false);
- int* currPosOfArgumentSubSeqStart =
- cpuArguments[slot].subSequenceStartPositions->getMutableData(
- false);
- int64_t* subSeqs = dataSubPos.data();
- int64_t* subIndexs = slots_[slot].subIndices.data();
- int allSubSequenceLength = 0;
- currPosOfArgumentSubSeqStart[0] = 0;
- // for each instance, compute sub-sequence number
- for (int instance = 0; instance < subSize; instance++) {
- int64_t currSubInstanceId = subSeqs[instance];
- int64_t currSubInstanceLength =
- subIndexs[currSubInstanceId + 1] - subIndexs[currSubInstanceId];
- // write subSequenceStartPositions
- allSubSequenceLength += currSubInstanceLength;
- currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength;
- // special: if current instance has NO feature in current slot
- if (currSubInstanceLength == 0) {
- allSubSequenceLength++;
- currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength;
- }
- }
- cpuArguments[slot].checkSubset();
- }
- break;
- }
- case SlotDef::INDEX: {
- // label slot
- IVector::resizeOrCreate(cpuArguments[slot].ids,
- size,
- /* useGpu= */ false);
- // fill labels
- int* buf = cpuArguments[slot].ids->getData();
- for (int i = 0; i < size; ++i) {
- buf[i] = slots_[slot].indexData[dataPos[i]];
- }
- // label HAS sequence structure
- cpuArguments[slot].sequenceStartPositions->fillSequence(false);
- break;
- }
- case SlotDef::VECTOR_DENSE: {
- // copy values
- size_t dim = header_.slot_defs(slot).dim();
- Matrix::resizeOrCreate(cpuArguments[slot].value,
- size,
- dim,
- false, // trans = false
- false); // useGpu = false
- real* buf = cpuArguments[slot].value->getData();
- for (int i = 0; i < size; ++i) {
- memcpy(buf + i * dim,
- slots_[slot].denseData.data() + dataPos[i] * dim,
- sizeof(real) * dim);
- }
- // sequence structure
- cpuArguments[slot].sequenceStartPositions->fillSequence(false);
- break;
- }
- default: { LOG(FATAL) << "should not reach here"; }
- }
- }
-
- if (useGpu_) {
- std::vector& cpuArguments = cpuBatch.getStreams();
- DataBatch& gpuBatch = *gpuBatch_;
- std::vector& gpuArguments = gpuBatch.getStreams();
- gpuArguments.resize(cpuArguments.size());
- gpuBatch.setSize(size);
- for (size_t i = 0; i < cpuArguments.size(); ++i) {
- gpuArguments[i].resizeAndCopyFrom(
- cpuArguments[i], useGpu_, HPPL_STREAM_1);
- }
- hl_stream_synchronize(HPPL_STREAM_1);
- *batch = gpuBatch;
- } else {
- *batch = cpuBatch;
- }
-
- currentSequenceIndex_ += numScannedSeqs;
- return batch->getSize();
-}
-
-} // namespace paddle
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h
deleted file mode 100644
index 7dd45e062248f20d24c633dd4e1c8b7eebcbfa1b..0000000000000000000000000000000000000000
--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include
-
-#include "DataFormat.pb.h"
-#include "paddle/utils/Stat.h"
-
-#include "DataProvider.h"
-#include "ProtoReader.h"
-
-namespace paddle {
-
-/**
- * @brief Provider data from protobuf data file with each sample
- * specified by proto message
- *
- * DataSample defined in DataFormat.proto.
- *
- * The file format is
- *
- * header
- *
- * sample1
- *
- * sample2
- *
- * ...
- *
- * sampleN
- *
- * @note: In the data file, each message is prefixed with its length.
- * The read/write of the protbuf are implemented in ProtoReader.h
- */
-class ProtoDataProvider : public DataProvider {
-public:
- ProtoDataProvider(const DataConfig& config,
- bool useGpu,
- bool loadDataAll = true);
- virtual void reset();
-
- /**
- * @note this size includes the sequences which are skipped because they
- * are longer than the batch size.
- */
- virtual int64_t getSize() {
- int64_t size = sampleNums_;
- if (usageRatio_ < 1.0f) {
- size = static_cast(size * usageRatio_);
- }
- return size;
- }
- virtual void shuffle();
-
- void loadData(const std::vector& fileList);
-
- virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-
-protected:
- /**
- * @brief load protobuf data from a list of file
- * @param[in] fileName file name of a file which contains
- * a list of file names
- */
- void loadData(const std::string& fileName);
-
- /**
- * @brief load protobuf data from file
- * @param[in] fileName data file name
- */
- void loadDataFile(const std::string& fileName);
- /** @brief check data header of each data sample
- * @param[in] header data header read from protobuf data
- */
- void checkDataHeader(const DataHeader& header);
- /**
- * @brief fill protobuf data into slot_,
- * slot_ is a vector of ProtoSlot in memory.
- * @param[in] sample data sample read from protobuf data
- */
- void fillSlots(const DataSample& sample);
-
- /**
- * @brief return true if each sample is one sequence, i.e., independent
- * of other samples.
- */
- inline bool iidData() const { return sequenceStartPositions_.empty(); }
-
- /**
- * @brief check that sample is consistent with header_
- */
- void checkSample(const DataSample& sample);
-
- template
- int64_t sequenceLoop(Op op, int64_t size);
-
- template
- int64_t sampleLoop(Op op, int64_t size);
-
- template
- int64_t subSampleLoop(Op op, int64_t size, int slot);
-
- void showDataStats();
-
-protected:
- struct ProtoVarSlot {
- std::vector data;
- std::vector dims;
- };
-
- struct ProtoSlot {
- SlotDef::SlotType type;
- int dim;
- std::vector indexData;
- std::vector denseData;
- std::vector sparseNonValueData;
- std::vector sparseFloatValueData;
- std::vector indices;
- std::vector subIndices;
-
- std::vector varDenseData;
- std::vector> varIndices;
- std::vector strData;
- };
- DataHeader header_;
- int numVecSlots_;
-
- std::vector slots_;
- size_t sampleNums_;
-
- /**
- * The starting position of each sequence in samples.
- * The last element should be num of samples.
- * If empty, each sample is one sequence.
- */
- std::vector sequenceStartPositions_;
-
- int64_t currentSequenceIndex_;
-
- // The size should be the number of sequences.
- std::vector shuffledSequenceIds_;
-
- ThreadLocalD cpuBatch_;
- ThreadLocalD gpuBatch_;
-
- RWLock lock_;
- std::vector nnzStats_; // stats for number of none-zeros entries
-};
-
-/**
- * @brief Special use for Proto data: instances should contain sparse-non-value
- * slots
- * and label.
- *
- * @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
- */
-class ProtoSequenceDataProvider : public ProtoDataProvider {
-public:
- ProtoSequenceDataProvider(const DataConfig& config,
- bool useGpu,
- bool loadDataAll = true);
- ~ProtoSequenceDataProvider() {}
- virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-};
-
-} // namespace paddle
diff --git a/paddle/gserver/layers/DotProdLayer.cpp b/paddle/gserver/layers/DotProdLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9e2dbe3c3c416f606d2938701f26288642b55267
--- /dev/null
+++ b/paddle/gserver/layers/DotProdLayer.cpp
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer computing the per-sample dot product of its two inputs.
+ * Input1: matrix (batchSize * dim), one vector per row
+ * Input2: matrix (batchSize * dim), same height and width as Input1
+ * Output: matrix (batchSize * 1), one dot product per row
+ */
+
+class DotProdLayer : public Layer {
+public:
+ explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
+
+ ~DotProdLayer() {}
+ // Checks that the layer has exactly two inputs and an output size of 1.
+ bool init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) override;
+ // forward() fills the (batchSize * 1) output; backward() updates input grads.
+ void forward(PassType passType) override;
+ void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(dot_prod, DotProdLayer);
+
+bool DotProdLayer::init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) {
+ // Bail out if the base class fails instead of ignoring its return value.
+ if (!Layer::init(layerMap, parameterMap)) return false;
+ // Exactly two inputs are reduced to a single output value per sample.
+ CHECK_EQ(inputLayers_.size(), 2U);
+ CHECK_EQ(1UL, getSize())
+ << "The output dimensionality of this layer should be fixed to 1.";
+ return true;
+}
+
+void DotProdLayer::forward(PassType passType) {
+ Layer::forward(passType);
+
+ MatrixPtr inV0 = getInputValue(0);
+ MatrixPtr inV1 = getInputValue(1);
+ // Both inputs must agree on the number of rows and on the row width.
+ size_t batchSize = inV0->getHeight();
+ CHECK_EQ(inV1->getHeight(), batchSize);
+ CHECK_EQ(inV0->getWidth(), inV1->getWidth());
+
+ {
+ REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+ reserveOutput(batchSize, 1);  // one output column per sample
+ }
+ // NOTE(review): the trailing 1, 0 look like scaleSum/scaleDest -- confirm.
+ MatrixPtr outV = getOutputValue();
+ {
+ REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
+ outV->sumOfProducts(*inV0, *inV1, 1, 0);
+ }
+}
+
+void DotProdLayer::backward(const UpdateCallback& callback) {
+ MatrixPtr inV0 = getInputValue(0);
+ MatrixPtr inV1 = getInputValue(1);
+ MatrixPtr outG = getOutputGrad();
+ MatrixPtr inG0 = getInputGrad(0);
+ MatrixPtr inG1 = getInputGrad(1);
+ // Each input's gradient is built from the OTHER input's value and outG.
+ {
+ REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
+ // NOTE(review): presumably adds rows of inV* scaled by outG -- confirm.
+ if (inG0) {  // skipped when input 0 has no gradient buffer
+ inG0->addRowScale(0, *inV1, *outG);
+ }
+
+ if (inG1) {  // skipped when input 1 has no gradient buffer
+ inG1->addRowScale(0, *inV0, *outG);
+ }
+ }
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.cpp b/paddle/gserver/layers/MKLDNNConcatLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c9099297cc5c741fbae0b42f21b988e6c561ef11
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.cpp
@@ -0,0 +1,202 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNConcatLayer.h"
+
+using namespace mkldnn; // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_concat, MKLDNNConcatLayer);
+
+bool MKLDNNConcatLayer::init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) {
+ if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+ return false;  // propagate a failed base-class init
+ }
+ CHECK_GT(inputLayers_.size(), 1UL);  // concat needs at least two inputs
+ CHECK(!biasParameter_);  // this layer must be configured without a bias
+ return true;
+}
+
+void MKLDNNConcatLayer::reshape(
+ int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+ // Input 0 sets bs/ih/iw; the other inputs must match and add their channels.
+ reshapeInput(bs, ih, iw);
+ ic = inputLayers_[0]->getSize() / ih / iw;
+ CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
+ CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
+ CHECK_GT(inputLayers_.size(), 1UL);
+ channels_.resize(inputLayers_.size());
+ channels_[0] = ic;
+ // oc is passed by value, so store the result in oc_. TODO(TJ): take &oc
+ oc_ = ic;
+ for (size_t i = 1; i < inputLayers_.size(); i++) {
+ int batchsize, height, width;
+ reshapeInput(batchsize, height, width, i);
+ CHECK_EQ(bs, batchsize);
+ CHECK_EQ(ih, height);
+ CHECK_EQ(iw, width);
+ // The size of input i must be an exact multiple of the image area.
+ channels_[i] = inputLayers_[i]->getSize() / height / width;
+ CHECK_EQ((size_t)channels_[i] * height * width, inputLayers_[i]->getSize());
+ oc_ += channels_[i];
+ }
+ oh = ih;
+ ow = iw;
+ reshapeOutput(oh, ow);
+ resizeOutput(bs, oc_ * oh * ow);
+}
+
+void MKLDNNConcatLayer::resetFwd(std::vector& pipeline,
+ MKLDNNMatrixPtr& in,
+ MKLDNNMatrixPtr& wgt,  // not referenced in this function
+ MKLDNNMatrixPtr& bias,  // not referenced in this function
+ MKLDNNMatrixPtr& out) {
+ resetFwdBuffers(inVals_, out);
+ in = inVals_[0];  // the single `in` slot only reports input 0
+ // Build the concat descriptor first, then the primitive that executes it.
+ std::shared_ptr fwdPD;
+ resetFwdPD(fwdPD, inVals_, out);
+
+ resetFwdPipeline(pipeline, fwdPD, inVals_, out);
+}
+
+void MKLDNNConcatLayer::resetBwd(std::vector& pipeline,
+ MKLDNNMatrixPtr& in,
+ MKLDNNMatrixPtr& wgt,  // not referenced in this function
+ MKLDNNMatrixPtr& bias,  // not referenced in this function
+ MKLDNNMatrixPtr& out) {
+ resetBwdBuffers(inGrads_, out);
+ in = inGrads_[0];  // the single `in` slot only reports input 0's gradient
+ // Queue one primitive per input (kept alive in bwds_) onto the pipeline.
+ resetBwdPipeline(pipeline, bwds_, inGrads_, out);
+}
+
+void MKLDNNConcatLayer::resetFwdBuffers(std::vector& inputs,
+ MKLDNNMatrixPtr& out) {
+ inputs.resize(inputLayers_.size());
+ bool has8c = false, has16c = false, hasnc = false;
+ for (size_t i = 0; i < inputs.size(); i++) {
+ // resetInValue reads ic_, so temporarily set it to this input's channels
+ // TODO(TJ): make ic_ a vector so that channels_ can be removed
+ ic_ = channels_[i];
+ resetInValue(inputs[i], nullptr, i);
+ CHECK(inputs[i]);
+ auto dm = inputs[i]->getDims();
+ // input formats may differ, but the number of dims must be equal
+ CHECK(i == 0 || dm.size() == inputs[0]->getDims().size());
+ CHECK_EQ(bs_, dm[0]);
+ CHECK_EQ(channels_[i], dm[1]);
+ if (dm.size() > 2) {
+ CHECK_EQ(ih_, dm[2]);
+ CHECK_EQ(iw_, dm[3]);
+ }
+ if (inputs[i]->getFormat() == format::nc) {
+ hasnc = true;
+ }
+ if (inputs[i]->getFormat() == format::nChw8c) {
+ has8c = true;
+ }
+ if (inputs[i]->getFormat() == format::nChw16c) {
+ has16c = true;
+ }
+ }
+ // restore ic_: it always holds the channel count of input 0
+ ic_ = channels_[0];
+ // Prefer a blocked format that some input already uses and oc_ divides into.
+ format outFmt;
+ if (has16c && oc_ % 16 == 0) {
+ outFmt = format::nChw16c;
+ } else if (has8c && oc_ % 8 == 0) {
+ outFmt = format::nChw8c;
+ } else if (hasnc) {
+ CHECK(oh_ == 1 && ow_ == 1);  // a 2-D output cannot carry an image size
+ outFmt = format::nc;
+ } else {
+ outFmt = format::nchw;
+ }
+ memory::dims outDims =
+ hasnc ? memory::dims{bs_, oc_} : memory::dims{bs_, oc_, oh_, ow_};
+ auto outPD = MKLDNNMatrix::createPrimitiveDesc(outDims, outFmt, engine_);
+ resetOutValue(out, outPD);
+}
+
+void MKLDNNConcatLayer::resetFwdPD(std::shared_ptr& pd,
+ std::vector& inputs,
+ MKLDNNMatrixPtr out) {
+ std::vector srcPDs;  // primitive descs of all inputs, in input order
+ for (size_t i = 0; i < inputs.size(); i++) {
+ srcPDs.push_back(inputs[i]->getPrimitiveDesc());
+ }
+ CHECK(out);  // the output buffer must exist before pd is created
+ pd.reset(new concat::primitive_desc(out->getMemoryDesc(), axis_, srcPDs));
+ CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+}
+
+void MKLDNNConcatLayer::resetFwdPipeline(
+ std::vector& pipeline,
+ std::shared_ptr& pd,
+ std::vector& inputs,
+ MKLDNNMatrixPtr& out) {
+ std::vector srcs;  // one source per input, in input order
+ for (size_t i = 0; i < inputs.size(); i++) {
+ srcs.push_back(*(inputs[i]));
+ }
+ fwd_.reset(new concat(*pd, srcs, *out));  // fwd_ owns the concat primitive
+ pipeline.push_back(*fwd_);
+}
+
+void MKLDNNConcatLayer::resetBwdBuffers(std::vector& inputs,
+ MKLDNNMatrixPtr& out) {
+ CHECK(outVal_);
+ resetOutGrad(out, outVal_->getPrimitiveDesc());  // same desc as outVal_
+ CHECK(out);
+
+ inputs.resize(inputLayers_.size());
+ for (size_t i = 0; i < inputs.size(); i++) {
+ CHECK(inVals_[i]);
+ // resetInGrad reads inVal_, so temporarily point it at this input's value
+ // TODO(TJ): move inVals_ to MKLDNNLayer and remove inVal_
+ inVal_ = inVals_[i];
+ resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
+ CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc());
+ }
+ // restore inVal_: it always holds the value of input 0
+ inVal_ = inVals_[0];
+}
+
+void MKLDNNConcatLayer::resetBwdPipeline(
+ std::vector& pipeline,
+ std::vector>& prims,
+ std::vector& inputs,
+ MKLDNNMatrixPtr& out) {
+ // one reorder per input copies that input's slice of `out` into inputs[i]
+ memory::dims offsets = {0, 0, 0, 0};
+ prims.resize(inputs.size());
+ CHECK_EQ(inputs.size(), channels_.size());
+ for (size_t i = 0; i < inputs.size(); i++) {
+ auto viewPD = view::primitive_desc(
+ out->getPrimitiveDesc(), inputs[i]->getDims(), offsets);
+ auto bwdPD = reorder::primitive_desc(viewPD.dst_primitive_desc(),
+ inputs[i]->getPrimitiveDesc());
+ prims[i].reset(new reorder(bwdPD, *out, *(inputs[i])));
+ offsets[axis_] += channels_[i];  // the next slice starts after this one
+ // queue the reorders in input order
+ pipeline.push_back(*prims[i]);
+ }
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.h b/paddle/gserver/layers/MKLDNNConcatLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5749d327e4259b81541a234f48a4538ab035fe4
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.h
@@ -0,0 +1,129 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+/**
+ * @brief MKLDNN concatenation layer: joins all of its inputs along axis_.
+ *
+ * The layer type used in config files is mkldnn_concat
+ */
+class MKLDNNConcatLayer : public MKLDNNLayer {
+protected:
+ std::vector inVals_;  // value buffer of every input
+ std::vector inGrads_;  // gradient buffer of every input
+ std::vector> bwds_;  // backward primitives, one per input
+ // number of channels of each input
+ std::vector channels_;
+
+ // concat_dimension in MKLDNN
+ // if axis_ == 0, concat batchsize
+ // if axis_ == 1, concat channel (default)
+ int axis_;
+
+public:
+ explicit MKLDNNConcatLayer(const LayerConfig& config)
+ : MKLDNNLayer(config), axis_(1) {}
+
+ ~MKLDNNConcatLayer() {}
+
+ bool init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) override;
+
+ void reshape(
+ int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+
+ void resetFwd(std::vector& pipeline,
+ MKLDNNMatrixPtr& in,
+ MKLDNNMatrixPtr& wgt,
+ MKLDNNMatrixPtr& bias,
+ MKLDNNMatrixPtr& out) override;
+
+ void resetBwd(std::vector& pipeline,
+ MKLDNNMatrixPtr& in,
+ MKLDNNMatrixPtr& wgt,
+ MKLDNNMatrixPtr& bias,
+ MKLDNNMatrixPtr& out) override;
+ // Logs the sizes of every input (they share bs_, ih_, iw_) and of the output.
+ void printSizeInfo() override {
+ CHECK_EQ(channels_.size(), inputLayers_.size());
+ for (size_t i = 0; i < channels_.size(); ++i) {
+ VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName()
+ << ": " << bs_ << ", " << channels_[i] << ", " << ih_
+ << ", " << iw_;
+ }
+ VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_
+ << ", " << ow_;
+ }
+ // Logs the value formats in forward order: inputs, outVal_, extOutVal_.
+ void printValueFormat() override {
+ for (size_t i = 0; i < inVals_.size(); ++i) {
+ VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+ << ": " << inVals_[i]->getFormat() << " >>>";
+ }
+ if (outVal_) {
+ VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
+ }
+ if (extOutVal_) {
+ VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
+ }
+ }
+ // Logs the grad formats in backward order: extOutGrad_, outGrad_, inputs.
+ void printGradFormat() override {
+ if (extOutGrad_) {
+ VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
+ }
+ if (outGrad_) {
+ VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
+ }
+ for (size_t i = 0; i < inGrads_.size(); ++i) {
+ VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+ << ": " << inGrads_[i]->getFormat() << "<<<";
+ }
+ }
+
+protected:
+ /**
+ * Forward functions: reset buffers (inputs, output),
+ * reset primitive descriptor,
+ * reset pipeline.
+ */
+ void resetFwdBuffers(std::vector& inputs,
+ MKLDNNMatrixPtr& out);
+ void resetFwdPD(std::shared_ptr& pd,
+ std::vector& inputs,
+ MKLDNNMatrixPtr out);
+ void resetFwdPipeline(std::vector& pipeline,
+ std::shared_ptr& pd,
+ std::vector& inputs,
+ MKLDNNMatrixPtr& out);
+
+ /**
+ * Backward functions: reset buffers (input grads, output grad),
+ * reset primitives and pipeline
+ */
+ void resetBwdBuffers(std::vector& inputs,
+ MKLDNNMatrixPtr& out);
+ void resetBwdPipeline(std::vector& pipeline,
+ std::vector>& prims,
+ std::vector& inputs,
+ MKLDNNMatrixPtr& out);
+};
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp
index e75ac5ba4647a8267b7bc189893bd7adb5c3053f..cf42da0735282d667d6b87061c8c59bf2f96e0be 100644
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -21,8 +21,8 @@ namespace paddle {
bool MKLDNNLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
- CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
- << "Please set WITH_MKLDNN=ON "
+ CHECK(FLAGS_use_mkldnn) << "MKLDNNLayers only support use_mkldnn."
+ << "Please set WITH_MKL=ON "
<< "and set use_mkldnn=True";
CHECK(!useGpu_) << "Do not support GPU yet";
@@ -138,8 +138,11 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
}
}
-void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) {
- const Argument& input = inputLayers_[0]->getOutput();
+void MKLDNNLayer::reshapeInput(int& batchsize,
+ int& height,
+ int& width,
+ size_t inputIdx) {
+ const Argument& input = inputLayers_[inputIdx]->getOutput();
batchsize = input.getBatchSize();
int h = input.getFrameHeight();
int w = input.getFrameWidth();
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 7479c34c92b5231b2521493bc631474d4efd4224..4c42df1bee75fa7b28c2001c30797cc0df7c5554 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -178,7 +178,10 @@ protected:
/**
* reshape the input image sizes and input batchsize
*/
- void reshapeInput(int& batchsize, int& height, int& width);
+ void reshapeInput(int& batchsize,
+ int& height,
+ int& width,
+ size_t inputIdx = 0);
/**
* reshape output image sizes
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 4bea348f637f39444e8aad89278e6366ecd73b1d..c295ea19c9ccb3d05c509a41925d2c36efdba8ef 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -29,7 +29,7 @@ gserver_test(test_KmaxSeqScore)
gserver_test(test_Expand)
gserver_test(test_MaxPoolingWithMaskOutput)
-########## test_Mkldnn layers and activations ##########
+########## test_MKLDNN layers and activations ##########
if(WITH_MKLDNN)
add_unittest_without_exec(test_MKLDNN
test_MKLDNN.cpp
@@ -62,17 +62,6 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
endif()
if(NOT MOBILE_INFERENCE)
-################### test_ProtoDataProvider ############
- add_unittest_without_exec(test_ProtoDataProvider
- test_ProtoDataProvider.cpp)
-
- # test_ProtoDataProvider will mkdir as same name,
- # so if WORKING_DIRECTORY is default directory, then
- # mkdir will get error.
- add_test(NAME test_ProtoDataProvider
- COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider
- WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-
################## test_Evaluator #######################
add_unittest(test_Evaluator
test_Evaluator.cpp)
@@ -110,3 +99,24 @@ add_test(NAME test_PyDataProvider2
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
)
+
+################# test_CompareSparse ##################
+add_unittest_without_exec(test_CompareSparse
+ test_CompareSparse.cpp)
+if(NOT ON_TRAVIS)  # the test is built everywhere but not registered on Travis
+ add_test(NAME test_CompareSparse
+ COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+ ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
+ ./.set_port.sh -p port -n 6  # relative path: resolved in WORKING_DIRECTORY
+ ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
+ WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endif()
+
+################ test_CompareTwoNets ######################
+add_unittest_without_exec(test_CompareTwoNets
+ test_CompareTwoNets.cpp)
+add_test(NAME test_CompareTwoNets
+ COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+ ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
+ ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
+ WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)  # configs use gserver/tests/...
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
index ca55a45bc77b4e171619ab788d7c7dfeefcd036a..9d61533c0b6f20c41130d7b7c15ad93392b2d24c 100644
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -23,7 +23,7 @@ limitations under the License. */
namespace paddle {
/**
- * @brief test the functionality of Mkldnnlayers
+ * @brief test the functionality of MKLDNNlayers and MKLDNNActivations
* refer to paddle original function
*/
class MKLDNNTester {
diff --git a/paddle/gserver/tests/proto_files.txt b/paddle/gserver/tests/proto_files.txt
deleted file mode 100644
index 691b38c7940bd21360eb00384e060554aa4b3e22..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/proto_files.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-./test_ProtoDataProvider/data1.bin
-./test_ProtoDataProvider/data2.bin
diff --git a/paddle/gserver/tests/proto_files_compressed.txt b/paddle/gserver/tests/proto_files_compressed.txt
deleted file mode 100644
index 7413c81e185d02e0d03aefa06480b9722357c5eb..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/proto_files_compressed.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-./test_ProtoDataProvider/data1.bin.gz
-./test_ProtoDataProvider/data2.bin.gz
diff --git a/paddle/gserver/tests/sequence_lstm.conf b/paddle/gserver/tests/sequence_lstm.conf
new file mode 100644
index 0000000000000000000000000000000000000000..f49a827f22edce056eaf9903e99b732cab7f3784
--- /dev/null
+++ b/paddle/gserver/tests/sequence_lstm.conf
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'  # relative path
+dict_file = dict()  # maps each stripped dictionary line to its line index
+for line_count, line in enumerate(open(dict_path, "r")):
+ dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+ train_list='gserver/tests/Sequence/train.list',
+ test_list=None,
+ module='sequenceGen',
+ obj='process',
+ args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())  # one entry per file line
+word_dim = 128
+hidden_dim = 256
+label_dim = 3
+sparse_update = get_config_arg("sparse_update", bool, False)
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+ input=data,
+ size=word_dim,
+ param_attr=ParamAttr(sparse_update=sparse_update))
+# The projection feeding the LSTM is four times as wide as hidden_dim.
+with mixed_layer(size=hidden_dim * 4) as lstm_input:
+ lstm_input += full_matrix_projection(input=emb)
+
+lstm = lstmemory(
+ input=lstm_input,
+ act=TanhActivation(),
+ gate_act=SigmoidActivation(),
+ state_act=TanhActivation())
+
+lstm_last = last_seq(input=lstm)
+
+with mixed_layer(
+ size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+ output += full_matrix_projection(input=lstm_last)
+
+outputs(
+ classification_cost(
+ input=output, label=data_layer(
+ name="label", size=1)))
diff --git a/paddle/gserver/tests/sequence_recurrent.py b/paddle/gserver/tests/sequence_recurrent.py
new file mode 100644
index 0000000000000000000000000000000000000000..4895df186bfecc5cb5263676a9cd5bac5039d565
--- /dev/null
+++ b/paddle/gserver/tests/sequence_recurrent.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'  # relative path
+dict_file = dict()  # maps each stripped dictionary line to its line index
+for line_count, line in enumerate(open(dict_path, "r")):
+ dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+ train_list='gserver/tests/Sequence/train.list',
+ test_list=None,
+ module='sequenceGen',
+ obj='process',
+ args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())  # one entry per file line
+word_dim = 128
+hidden_dim = 128  # not referenced below in this config
+label_dim = 3
+
+# This config is designed to be equivalent to sequence_recurrent_group.py
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+ input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
+
+recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
+
+recurrent_last = last_seq(input=recurrent)
+
+with mixed_layer(
+ size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+ output += full_matrix_projection(input=recurrent_last)
+
+outputs(
+ classification_cost(
+ input=output, label=data_layer(
+ name="label", size=1)))
diff --git a/paddle/gserver/tests/sequence_recurrent_group.py b/paddle/gserver/tests/sequence_recurrent_group.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1d54542e3bc4e89f70d31d5e89c0f44953c9f90
--- /dev/null
+++ b/paddle/gserver/tests/sequence_recurrent_group.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'  # relative path
+dict_file = dict()  # maps each stripped dictionary line to its line index
+for line_count, line in enumerate(open(dict_path, "r")):
+ dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+ train_list='gserver/tests/Sequence/train.list',
+ test_list=None,
+ module='sequenceGen',
+ obj='process',
+ args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())  # one entry per file line
+word_dim = 128
+hidden_dim = 128
+label_dim = 3
+
+# This config is designed to be equivalent to sequence_recurrent.py
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+ input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
+
+# One recurrent step: the current input plus a projection of the previous state.
+def step(y):
+ mem = memory(name="rnn_state", size=hidden_dim)
+ with mixed_layer(
+ name="rnn_state",
+ size=hidden_dim,
+ bias_attr=False,
+ act=SoftmaxActivation()) as out:
+ out += identity_projection(input=y)
+ out += full_matrix_projection(
+ input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__"))
+ return out
+
+
+recurrent = recurrent_group(name="rnn", step=step, input=emb)
+
+recurrent_last = last_seq(input=recurrent)
+
+with mixed_layer(
+ size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+ output += full_matrix_projection(input=recurrent_last)
+
+outputs(
+ classification_cost(
+ input=output, label=data_layer(
+ name="label", size=1)))
diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/gserver/tests/test_CompareSparse.cpp
similarity index 98%
rename from paddle/trainer/tests/test_CompareSparse.cpp
rename to paddle/gserver/tests/test_CompareSparse.cpp
index 5f1834bd730375fc10762fc19788d0c693f8e752..c6e07650fc4805a25baf38b9059f6c996d00cafc 100644
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/gserver/tests/test_CompareSparse.cpp
@@ -22,8 +22,7 @@ limitations under the License. */
using namespace paddle; // NOLINT
using namespace std; // NOLINT
-static const string& configFile1 =
- "trainer/tests/sample_trainer_config_compare_sparse.conf";
+static const string& configFile1 = "gserver/tests/sequence_lstm.conf";
DECLARE_bool(use_gpu);
DECLARE_string(config);
diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/gserver/tests/test_CompareTwoNets.cpp
similarity index 95%
rename from paddle/trainer/tests/test_CompareTwoNets.cpp
rename to paddle/gserver/tests/test_CompareTwoNets.cpp
index 94f65e545d116c802fb4877dc14f07aaaf83a4fb..801d9607565910b1f7f68a9c4532de5877e44f30 100644
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/gserver/tests/test_CompareTwoNets.cpp
@@ -30,8 +30,6 @@ DECLARE_bool(use_gpu);
DECLARE_string(config);
DECLARE_string(nics);
-DEFINE_string(config_file_a, "", "config of one network to compare");
-DEFINE_string(config_file_b, "", "config of another network to compare");
DEFINE_bool(need_high_accuracy,
false,
"whether need to run in double accuracy");
@@ -42,6 +40,10 @@ DEFINE_double(
DECLARE_bool(thread_local_rand_use_global_seed);
DECLARE_int32(seed);
+static const string& config_file_a = "gserver/tests/sequence_recurrent.py";
+static const string& config_file_b =
+ "gserver/tests/sequence_recurrent_group.py";
+
struct ComData {
vector outArgs;
vector parameters;
@@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) {
DataBatch dataBatch;
int32_t batchSize = trainer.getConfig().opt_config().batch_size();
+ trainer.getDataProvider()->reset();
trainer.getDataProvider()->setSkipShuffle();
trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
@@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
TEST(Trainer, create) {
ComData dataA;
- calcGradient(dataA, FLAGS_config_file_a);
+ calcGradient(dataA, config_file_a);
LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
ComData dataB;
- calcGradient(dataB, FLAGS_config_file_b);
+ calcGradient(dataB, config_file_b);
LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
compareGradient(dataA, dataB);
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 18f8d602b223cb4f2f620417862264776e752620..cacf10692942f5eca2f6c498183f4acc00768460 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1082,6 +1082,21 @@ TEST(Layer, InterpolationLayer) {
}
}
+TEST(Layer, DotProdLayer) {
+ TestConfig config;
+ config.layerConfig.set_type("dot_prod");
+ config.layerConfig.set_size(1);  // the dot_prod layer checks for size 1
+ // Two data inputs of the same size (10).
+ config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+ config.layerConfig.add_inputs();
+ config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+ config.layerConfig.add_inputs();
+ // Run the gradient check once with useGpu = false and once with true.
+ for (auto useGpu : {false, true}) {
+ testLayerGrad(config, "dot_prod", 10, false, useGpu);
+ }
+}
+
TEST(Layer, OuterProdLayer) {
TestConfig config;
config.layerConfig.set_type("out_prod");
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index a859e34c8996d81f14bf1edcb6e23d5a4f687e6b..42644e9601a82ea81c417adc6441edeb036998e2 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -313,6 +313,47 @@ TEST(MKLDNNLayer, AddtoLayer) {
testAddtoLayer({4, 12, 1, 1}, 3);
}
+static void getMKLDNNConcatConfig(TestConfig& cfg,
+ const std::vector& inputs) {
+ CHECK_GE(inputs.size(), 2UL) << "at least two inputs";  // unsigned compare
+ int oc = inputs[0].ic;  // output channels: sum of all input channels
+ for (size_t i = 1; i < inputs.size(); ++i) {
+ CHECK_EQ(inputs[i].bs, inputs[0].bs);  // only the channels may differ
+ CHECK_EQ(inputs[i].ih, inputs[0].ih);
+ CHECK_EQ(inputs[i].iw, inputs[0].iw);
+ oc += inputs[i].ic;
+ }
+ cfg.biasSize = 0;  // the concat layer is configured without a bias
+ cfg.layerConfig.set_type("mkldnn_concat");
+ cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
+ cfg.layerConfig.set_active_type("relu");
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ std::stringstream ss;
+ ss << "layer_" << i;  // input layers are named layer_0, layer_1, ...
+ cfg.inputDefs.push_back(
+ {INPUT_DATA,
+ ss.str(),
+ (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
+ 0});
+ LayerInputConfig* input = cfg.layerConfig.add_inputs();
+ ImageConfig* img_conf = input->mutable_image_conf();
+ img_conf->set_channels(inputs[i].ic);
+ img_conf->set_img_size_y(inputs[i].ih);
+ img_conf->set_img_size(inputs[i].iw);
+ }
+}
+
+void testConcatLayer(const std::vector& inputs) {
+ TestConfig dnnConfig;
+ getMKLDNNConcatConfig(dnnConfig, inputs);  // fills in an mkldnn_concat config
+ RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
+}
+
+TEST(MKLDNNLayer, ConcatLayer) {
+ testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});  // 1x1
+ testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});  // 8x8 images
+}
+
void testActivation(std::string actType, const testImageDesc& pm) {
// TODO(TJ): remove me when paddle support elu activation
if (actType == "mkldnn_elu") {
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
deleted file mode 100644
index af6472619d1840e82787974d265d601b4a406c09..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ /dev/null
@@ -1,732 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include
-#include
-
-#include
-
-#include "paddle/gserver/dataproviders/ProtoDataProvider.h"
-#include "paddle/utils/Util.h"
-
-#include "paddle/testing/TestUtil.h"
-
-using namespace std; // NOLINT
-
-std::vector protoFiles{
- "./test_ProtoDataProvider/data1.bin", "./test_ProtoDataProvider/data2.bin",
-};
-std::vector protoFilesCompressed{
- "./test_ProtoDataProvider/data1.bin.gz",
- "./test_ProtoDataProvider/data2.bin.gz",
-};
-
-const char* kTestDir = "./test_ProtoDataProvider";
-const char kProtoFileList[] = "gserver/tests/proto_files.txt";
-const char kProtoFileListCompressed[] =
- "gserver/tests/proto_files_compressed.txt";
-const int kSpraseMatrixDim = 1024;
-
-using namespace paddle; // NOLINT
-
-void prepareData(DataBatch* batch,
- const int* numPerSlotType,
- bool iid,
- bool useGpu) {
- batch->clear();
- int64_t size = uniformRandom(100) + 10;
- batch->setSize(size);
-
- ICpuGpuVectorPtr sequenceStartPositions;
- ICpuGpuVectorPtr subSequenceStartPositions;
- if (!iid) {
- int numSeqs = uniformRandom(10) + 1;
- sequenceStartPositions =
- ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
- int* buf = sequenceStartPositions->getMutableData(false);
- subSequenceStartPositions =
- ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
- int* subBuf = subSequenceStartPositions->getMutableData(false);
- int64_t pos = 0;
- int maxLen = 2 * size / numSeqs;
- for (int i = 0; i < numSeqs; ++i) {
- int len =
- uniformRandom(min(maxLen, size - pos - numSeqs + i)) + 1;
- buf[i] = pos;
- subBuf[i] = pos;
- pos += len;
- VLOG(1) << " len=" << len;
- }
- buf[numSeqs] = size;
- subBuf[numSeqs] = size;
- }
-
- vector& arguments = batch->getStreams();
- for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_DENSE]; ++i) {
- int64_t dim = rand() % 10 + 4; // NOLINT rand_r
- MatrixPtr mat = Matrix::create(size, dim, /* trans= */ false, false);
- mat->randomizeUniform();
- Argument arg;
- arg.value = mat;
- arg.sequenceStartPositions = sequenceStartPositions;
- arguments.push_back(arg);
- }
- for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE]; ++i) {
- MatrixPtr mat =
- makeRandomSparseMatrix(size, kSpraseMatrixDim, false, useGpu);
- Argument arg;
- arg.value = mat;
- arg.sequenceStartPositions = sequenceStartPositions;
- arg.subSequenceStartPositions = subSequenceStartPositions;
- arguments.push_back(arg);
- }
- for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE]; ++i) {
- MatrixPtr mat =
- makeRandomSparseMatrix(size, kSpraseMatrixDim, true, useGpu);
- Argument arg;
- arg.value = mat;
- arg.sequenceStartPositions = sequenceStartPositions;
- arguments.push_back(arg);
- }
- for (int i = 0; i < numPerSlotType[SlotDef::STRING]; ++i) {
- int64_t dim = rand() % 10 + 4; // NOLINT rand_r
- SVectorPtr vec = std::make_shared>();
- for (int j = 0; j < size; ++j) {
- vec->push_back(randStr(dim));
- }
- Argument arg;
- arg.strs = vec;
- arg.sequenceStartPositions = sequenceStartPositions;
- arguments.push_back(arg);
- }
- for (int i = 0; i < numPerSlotType[SlotDef::INDEX]; ++i) {
- int64_t dim = rand() % 10 + 4; // NOLINT rand_r
- IVectorPtr vec = IVector::create(size, /* useGpu= */ false);
- int* buf = vec->getData();
- for (int j = 0; j < size; ++j) {
- buf[j] = uniformRandom(dim);
- }
- Argument arg;
- arg.ids = vec;
- arg.sequenceStartPositions = sequenceStartPositions;
- arguments.push_back(arg);
- }
-}
-
-inline int getSlotDim(const Argument& arg) {
- if (arg.value) {
- return arg.value->getWidth();
- } else if (arg.ids) {
- return arg.ids->getMax() + 1;
- } else if (arg.strs) {
- return 1;
- }
- LOG(FATAL) << "Invalid argument";
- return 0;
-}
-
-inline SlotDef::SlotType getSlotType(const Argument& arg) {
- if (arg.value) {
- auto& m = *arg.value;
- auto& type = typeid(m);
- if (type == typeid(CpuMatrix) || type == typeid(GpuMatrix)) {
- return SlotDef::VECTOR_DENSE;
- }
- if (type == typeid(CpuSparseMatrix)) {
- auto valueType =
- std::dynamic_pointer_cast(arg.value)->getValueType();
- if (NO_VALUE == valueType) {
- return SlotDef::VECTOR_SPARSE_NON_VALUE;
- } else {
- return SlotDef::VECTOR_SPARSE_VALUE;
- }
- }
- if (type == typeid(GpuSparseMatrix)) {
- auto valueType =
- std::dynamic_pointer_cast(arg.value)->getValueType();
- if (NO_VALUE == valueType) {
- return SlotDef::VECTOR_SPARSE_NON_VALUE;
- } else {
- return SlotDef::VECTOR_SPARSE_VALUE;
- }
- }
-
- LOG(FATAL) << "Unknown matrix type";
- }
- if (arg.ids) return SlotDef::INDEX;
- if (arg.strs) return SlotDef::STRING;
- LOG(FATAL) << "Invalid argument";
- return SlotDef::VECTOR_DENSE;
-}
-
-void getColRow(const Argument& arg,
- int64_t pos,
- bool useGpu,
- int* colNum,
- const int** rowCols,
- const real** rowValues) {
- SlotDef::SlotType type = getSlotType(arg);
- GpuSparseMatrixPtr matGpu;
- CpuSparseMatrixPtr matCpu;
- if (useGpu) {
- matGpu = dynamic_pointer_cast(arg.value);
- ASSERT_TRUE(matGpu != NULL);
- } else {
- matCpu = dynamic_pointer_cast(arg.value);
- ASSERT_TRUE(matCpu != NULL);
- }
- *colNum = useGpu ? matGpu->getColNum(pos) : matCpu->getColNum(pos);
- *rowCols = useGpu ? matGpu->getRowCols(pos) : matCpu->getRowCols(pos);
- if (type == SlotDef::VECTOR_SPARSE_VALUE) {
- *rowValues = useGpu ? matGpu->getRowValues(pos) : matCpu->getRowValues(pos);
- } else {
- *rowValues = NULL;
- }
-}
-
-void makeSample(const vector& arguments,
- int64_t pos,
- bool isBeginning,
- DataSample* sample,
- bool useGpu) {
- sample->set_is_beginning(isBeginning);
- int slotid = 0;
- for (auto& arg : arguments) {
- SlotDef::SlotType type = getSlotType(arg);
- int64_t dim = getSlotDim(arg);
- switch (type) {
- case SlotDef::VECTOR_DENSE: {
- VectorSlot* vecSlot = sample->add_vector_slots();
- auto values = vecSlot->mutable_values();
- values->Reserve(dim);
- for (int i = 0; i < dim; ++i) {
- values->AddAlreadyReserved(
- static_cast(arg.value->getElement(pos, i)));
- }
- break;
- }
- case SlotDef::INDEX: {
- sample->add_id_slots(arg.ids->get(pos));
- break;
- }
- case SlotDef::VECTOR_SPARSE_NON_VALUE: {
- VectorSlot* vecSlot = sample->add_vector_slots();
- auto ids = vecSlot->mutable_ids();
- int colNum;
- const int* rowCols;
- const real* rowValues; // nullptr
- getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues);
- ids->Reserve(colNum);
- for (int i = 0; i < colNum; ++i) {
- ids->AddAlreadyReserved(rowCols[i]);
- }
- SubseqSlot* subseqSlot = sample->add_subseq_slots(); // subseq
- subseqSlot->set_slot_id(slotid);
- auto lens = subseqSlot->mutable_lens();
- lens->Add(colNum);
- break;
- }
- case SlotDef::VECTOR_SPARSE_VALUE: {
- VectorSlot* vecSlot = sample->add_vector_slots();
- auto values = vecSlot->mutable_values();
- auto ids = vecSlot->mutable_ids();
- int colNum;
- const int* rowCols;
- const real* rowValues;
- getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues);
- ids->Reserve(colNum);
- values->Reserve(colNum);
- for (int i = 0; i < colNum; ++i) {
- ids->AddAlreadyReserved(rowCols[i]);
- values->AddAlreadyReserved(rowValues[i]);
- }
- break;
- }
- case SlotDef::VAR_MDIM_DENSE:
- case SlotDef::VAR_MDIM_INDEX: {
- LOG(FATAL) << "Not implemented";
- break;
- }
- case SlotDef::STRING: {
- VectorSlot* vecSlot = sample->add_vector_slots();
- vecSlot->add_strs((*arg.strs)[pos]);
- break;
- }
- }
- slotid++;
- }
-}
-
-void writeData(const DataBatch& batch, bool useGpu, bool dataCompression) {
- DataHeader header;
- const vector& arguments = batch.getStreams();
- for (auto& argument : arguments) {
- SlotDef* slotDef = header.add_slot_defs();
- slotDef->set_type(getSlotType(argument));
- slotDef->set_dim(getSlotDim(argument));
- }
- VLOG(1) << "header=" << header.DebugString();
-
- int64_t totalSeqs = batch.getNumSequences();
- int64_t seq = 0;
- ICpuGpuVectorPtr sequenceStartPositions = arguments[0].sequenceStartPositions;
- int64_t numWritten = 0;
- vector curProtoFiles =
- dataCompression ? protoFilesCompressed : protoFiles;
- for (size_t i = 0; i < curProtoFiles.size(); ++i) {
- int64_t numSeqs = totalSeqs * (i + 1) / curProtoFiles.size() -
- totalSeqs * i / curProtoFiles.size();
- ofstream os(curProtoFiles[i]);
- CHECK(os) << "Fail to open " << curProtoFiles[i];
- unique_ptr writer(new ProtoWriter(&os, dataCompression));
- CHECK(writer->write(header));
- for (int j = 0; j < numSeqs; ++j, ++seq) {
- int64_t begin = seq;
- int64_t end = seq + 1;
- if (sequenceStartPositions) {
- begin = sequenceStartPositions->getElement(seq);
- end = sequenceStartPositions->getElement(seq + 1);
- }
- for (int pos = begin; pos < end; ++pos) {
- DataSample sample;
- makeSample(arguments, pos, pos == begin, &sample, useGpu);
- CHECK(writer->write(sample));
- ++numWritten;
- }
- }
-
- writer.reset(nullptr);
- os.close();
- }
- CHECK_EQ(arguments[0].getBatchSize(), numWritten);
-}
-
-// check that the sample at pos1 in args1 is same as the sample at pos2 in args2
-void checkSample(const vector& args1,
- int64_t pos1,
- const vector& args2,
- int64_t pos2,
- bool useGpu) {
- EXPECT_EQ(args1.size(), args2.size());
- VLOG(1) << " pos1=" << pos1 << " pos2=" << pos2;
-
- for (size_t i = 0; i < args1.size(); ++i) {
- auto type = getSlotType(args1[i]);
- int dim = getSlotDim(args1[i]);
- EXPECT_EQ(type, getSlotType(args2[i]));
- if (type == SlotDef::INDEX) {
- EXPECT_GE(dim, getSlotDim(args2[i]));
- } else {
- EXPECT_EQ(dim, getSlotDim(args2[i]));
- }
- switch (type) {
- case SlotDef::VECTOR_DENSE: {
- for (int j = 0; j < dim; ++j) {
- EXPECT_EQ(static_cast(args1[i].value->getElement(pos1, j)),
- static_cast(args2[i].value->getElement(pos2, j)));
- }
- break;
- }
- case SlotDef::INDEX: {
- EXPECT_EQ(args1[i].ids->get(pos1), args2[i].ids->get(pos2));
- break;
- }
- case SlotDef::VECTOR_SPARSE_NON_VALUE:
- case SlotDef::VECTOR_SPARSE_VALUE: {
- int colNum1, colNum2;
- const int *rowCols1, *rowCols2;
- const real *rowValues1, *rowValues2;
- getColRow(args1[i], pos1, useGpu, &colNum1, &rowCols1, &rowValues1);
- getColRow(args2[i], pos2, useGpu, &colNum2, &rowCols2, &rowValues2);
- EXPECT_EQ(colNum1, colNum2);
- for (int j = 0; j < colNum1; ++j) {
- EXPECT_EQ(rowCols1[j], rowCols2[j]);
- if (type == SlotDef::VECTOR_SPARSE_VALUE) {
- EXPECT_EQ(rowValues1[j], rowValues2[j]);
- }
- }
- break;
- }
- case SlotDef::VAR_MDIM_DENSE:
- case SlotDef::VAR_MDIM_INDEX: {
- LOG(FATAL) << "Not implemented";
- break;
- }
- case SlotDef::STRING: {
- EXPECT_EQ((*args1[i].strs)[pos1], (*args2[i].strs)[pos2]);
- break;
- }
- }
- }
-}
-
-void testProtoDataProvider(int* numPerSlotType,
- bool iid,
- bool async,
- bool useGpu,
- bool dataCompression,
- int numConstantSlots = 0) {
- mkDir(kTestDir);
- DataBatch data;
-
- prepareData(&data, numPerSlotType, iid, useGpu);
- writeData(data, useGpu, dataCompression);
-
- DataConfig config;
- config.set_type("proto");
- config.set_files(dataCompression ? kProtoFileListCompressed : kProtoFileList);
- config.set_async_load_data(async);
-
- for (int i = 0; i < numConstantSlots; ++i) {
- config.add_constant_slots(i + 11);
- MatrixPtr w = Matrix::create(data.getSize(),
- 1,
- /* trans= */ false,
- /* useGpu= */ false);
- w->assign(config.constant_slots(i));
- data.appendData(w);
- }
-
- unique_ptr dataProvider(DataProvider::create(config, useGpu));
- dataProvider->setSkipShuffle();
-
- EXPECT_EQ(data.getSize(), dataProvider->getSize());
-
- int64_t batchSize = 10;
- DataBatch batch;
-
- size_t seq1 = 0;
- vector& args1 = data.getStreams();
- ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
-
- dataProvider->reset();
-
- while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
- CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
- vector& args2 = batch.getStreams();
- ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
- for (auto& arg : args2) {
- EXPECT_EQ(iid, !arg.sequenceStartPositions);
- }
- size_t numSeqs = batch.getNumSequences();
- VLOG(1) << "numSeqs=" << numSeqs;
- for (size_t seq2 = 0; seq2 < numSeqs; ++seq1, ++seq2) {
- int64_t begin1 = seq1;
- int64_t end1 = seq1 + 1;
- if (sequenceStartPositions1) {
- begin1 = sequenceStartPositions1->getElement(seq1);
- end1 = sequenceStartPositions1->getElement(seq1 + 1);
- EXPECT_LT(seq1, sequenceStartPositions1->getSize() - 1);
- }
-
- int64_t begin2 = seq2;
- int64_t end2 = seq2 + 1;
- if (sequenceStartPositions2) {
- begin2 = sequenceStartPositions2->getElement(seq2);
- end2 = sequenceStartPositions2->getElement(seq2 + 1);
- }
- VLOG(1) << " begin1=" << begin1 << " end1=" << end1
- << " begin2=" << begin2 << " end2=" << end2;
- EXPECT_EQ(end1 - begin1, end2 - begin2);
- for (int i = 0; i < end1 - begin1; ++i) {
- checkSample(args1, begin1 + i, args2, begin2 + i, useGpu);
- }
- }
- }
-
- EXPECT_EQ(seq1, (size_t)data.getNumSequences());
- rmDir(kTestDir);
-}
-
-TEST(ProtoDataProvider, test) {
- int numSlotsArray[] = {0, 3};
- int numTwoArray[] = {0, 1};
- int numSlotsArraySize = sizeof(numSlotsArray) / sizeof(numSlotsArray[0]);
- const int numSlot = 5;
- int combination[numSlot] = {0};
- int k = numSlot - 1;
- while (k >= 0) {
- int numDenseVecSlots = numSlotsArray[combination[0]];
- int numSparseNonValueVecSlots = numSlotsArray[combination[1]];
- int numSparseValueVectorSlots = numSlotsArray[combination[2]];
- int numStrSlots = numSlotsArray[combination[3]];
- int numIdSlots = numSlotsArray[combination[4]];
- // while loop : traverse all cases
- k = numSlot - 1;
- while (k >= 0) {
- if (combination[k] < (numSlotsArraySize - 1)) {
- ++combination[k];
- break;
- } else {
- combination[k] = 0;
- --k;
- }
- }
- if (numDenseVecSlots + numSparseNonValueVecSlots +
- numSparseValueVectorSlots + numStrSlots + numIdSlots <
- 1)
- continue;
- for (int iid : numTwoArray) {
- for (int async : numTwoArray) {
- for (int useGpu : numTwoArray) {
- for (int dataCompression : numTwoArray) {
- if (async && useGpu) {
- // Currently in async mode, useGpu is not supported
- continue;
- }
-#ifndef PADDLE_WITH_CUDA
- if (useGpu) {
- continue;
- }
-#endif
- LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
- << " numSparseNonValueVecSlots="
- << numSparseNonValueVecSlots
- << " numSparseValueVectorSlots="
- << numSparseValueVectorSlots
- << " numStrSlots=" << numStrSlots
- << " numIdSlots=" << numIdSlots << " iid=" << iid
- << " async=" << async << " useGpu=" << useGpu
- << " dataCompression=" << dataCompression;
- int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
- numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
- numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
- numSparseNonValueVecSlots;
- numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] =
- numSparseValueVectorSlots;
- numPerSlotType[SlotDef::INDEX] = numIdSlots;
- numPerSlotType[SlotDef::STRING] = numStrSlots;
- testProtoDataProvider(
- numPerSlotType, iid, async, useGpu, dataCompression);
- } // end for (int dataCompression : numTwoArray)
- } // end for (int useGpu : numTwoArray)
- } // end for (int async : numTwoArray)
- } // end for (int iid : numTwoArray)
- } // end for (while, traverse all slots)
-}
-
-TEST(ProtoDataProvider, constant_slots) {
- int numSlotsArray[] = {0, 3};
- int numTwoArray[] = {0, 1};
- for (int numDenseVecSlots : numSlotsArray) {
- for (int numSparseNonValueVecSlots : numSlotsArray) {
- if (numDenseVecSlots + numSparseNonValueVecSlots < 1) continue;
- for (int numConstantSlots : {1, 2}) {
- for (int useGpu : numTwoArray) {
- for (int dataCompression : numTwoArray) {
-#ifndef PADDLE_WITH_CUDA
- if (useGpu) {
- continue;
- }
-#endif
- LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
- << " numSparseNonValueVecSlots="
- << numSparseNonValueVecSlots
- << " numConstantSlogs=" << numConstantSlots
- << " useGpu=" << useGpu
- << " dataCompression=" << dataCompression;
- int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
- numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
- numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
- numSparseNonValueVecSlots;
- numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] = 1;
- numPerSlotType[SlotDef::INDEX] = 1;
- testProtoDataProvider(numPerSlotType,
- /* iid= */ true,
- /* async= */ false,
- useGpu,
- dataCompression,
- numConstantSlots);
- } // end for (int dataCompression : numTwoArray)
- } // end for (int useGpu : numTwoArray)
- } // end for (int numConstantSlots : {1, 2})
- } // end for (int numSparseNonValueVecSlots : numSlotsArray)
- } // end for (int numDenseVecSlots : numSlotsArray)
-}
-
-void checkSampleSequence(const vector& args1,
- const vector& args2,
- int64_t offset,
- int64_t numSeqs,
- bool useGpu) {
- // check slot num are equal
- EXPECT_EQ(args1.size(), args2.size());
- for (size_t i = 0; i < args1.size(); i++) {
- auto type = getSlotType(args1[i]);
- // check for args2: sequenceStartPositions vs numSeqs
- // (1) size
- EXPECT_EQ(args2[i].sequenceStartPositions->getSize(), (size_t)numSeqs + 1);
- // (2) content
- auto checkArgContent = [&](const Argument& args, int numSeqs) {
- for (int j = 0; j <= numSeqs; j++) {
- int start_pos = args.sequenceStartPositions->getElement(j);
- EXPECT_EQ(start_pos, j);
- }
- };
- switch (type) {
- case SlotDef::INDEX: {
- // args1: for label
- checkArgContent(args2[i], numSeqs);
- // check for args2: ids are equal to args1[offset]
- // (1) size
- EXPECT_EQ(args2[i].ids->getSize(), (size_t)numSeqs);
- // (2) content
- for (int j = 0; j < numSeqs; j++) {
- EXPECT_EQ(args2[i].ids->get(j), args1[i].ids->get(offset + j));
- }
- break;
- }
- case SlotDef::VECTOR_SPARSE_NON_VALUE: {
- // args1: for sparse_non_value
- // args2 should put sparse indexes in ids
- int colNum1;
- const int* rowCols1;
- const real* rowValues1; // nullptr
- int totalLength = 0;
- for (int j = 0; j < numSeqs; j++) {
- getColRow(
- args1[i], offset + j, useGpu, &colNum1, &rowCols1, &rowValues1);
- // (1) lengths
- EXPECT_EQ(totalLength,
- args2[i].sequenceStartPositions->getElement(j));
- EXPECT_EQ(totalLength,
- args2[i].subSequenceStartPositions->getElement(j));
- // (2) content
- for (int k = 0; k < colNum1; k++) {
- EXPECT_EQ(rowCols1[k], args2[i].ids->get(totalLength + k));
- }
- totalLength += colNum1;
- if (colNum1 == 0) {
- // special case here: we will put a "-1" into ids when column num is
- // zero. see ProtoSequenceDataProvider::getNextBatchInternal.
- EXPECT_EQ(-1, args2[i].ids->get(totalLength));
- totalLength++;
- }
- }
- EXPECT_EQ(totalLength,
- args2[i].sequenceStartPositions->getElement(numSeqs));
- EXPECT_EQ(totalLength,
- args2[i].subSequenceStartPositions->getElement(numSeqs));
- break;
- }
- case SlotDef::VECTOR_DENSE: {
- // args1: for dense vector
- checkArgContent(args2[i], numSeqs);
- // check for args2: values are equal to args1[offset]
- // (1) size
- EXPECT_EQ(args2[i].value->getHeight(), (size_t)numSeqs);
- EXPECT_EQ(args2[i].value->getWidth(), (size_t)getSlotDim(args1[i]));
- // (2) content
- for (int j = 0; j < numSeqs; j++) {
- for (size_t k = 0; k < args2[i].value->getWidth(); k++) {
- EXPECT_EQ(
- static_cast(args1[i].value->getElement(j + offset, k)),
- static_cast(args2[i].value->getElement(j, k)));
- }
- }
- break;
- }
- default: { EXPECT_EQ(true, false) << "should not reach here"; }
- }
- }
-}
-
-void testProtoSequenceDataProvider(int* numPerSlotType,
- bool async,
- bool useGpu) {
- mkDir(kTestDir);
- DataBatch data;
-
- prepareData(&data,
- numPerSlotType,
- /* iid */ true,
- useGpu);
- writeData(data, useGpu, /* dataCompression */ false);
-
- DataConfig config;
- config.set_type("proto_sequence");
- config.set_files(kProtoFileList);
- config.set_async_load_data(async);
-
- unique_ptr dataProvider(DataProvider::create(config, useGpu));
- dataProvider->setSkipShuffle();
-
- EXPECT_EQ(data.getSize(), dataProvider->getSize());
-
- int64_t batchSize = 10;
- DataBatch batch;
-
- vector& args1 = data.getStreams();
- ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
-
- dataProvider->reset();
-
- size_t args1Offset = 0;
- while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
- CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
- vector& args2 = batch.getStreams();
- ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
- for (auto& arg : args1) {
- // args1 should not has sequence
- EXPECT_EQ(true, !arg.sequenceStartPositions);
- }
- for (auto& arg : args2) {
- // args2 should has sequence
- EXPECT_NE(true, !arg.sequenceStartPositions);
- }
- size_t numSeqs = batch.getNumSequences();
- checkSampleSequence(args1, args2, args1Offset, numSeqs, useGpu);
- args1Offset += numSeqs;
- }
-
- EXPECT_EQ(args1Offset, (size_t)data.getNumSequences());
- rmDir(kTestDir);
-}
-
-TEST(ProtoSequenceDataProvider, test) {
- int numSlotsArray[] = {0, 3};
- int numTwoArray[] = {0, 1};
- for (int numSparseNonValueVecSlots : numSlotsArray) {
- for (int numIdSlots : numSlotsArray) {
- for (int numDenseVecSlots : numSlotsArray) {
- if (numDenseVecSlots + numSparseNonValueVecSlots + numIdSlots < 1)
- continue;
- for (int async : numTwoArray) {
- for (int useGpu : numTwoArray) {
- if (async && useGpu) {
- // Currently in async mode, useGpu is not supported
- continue;
- }
-#ifndef PADDLE_WITH_CUDA
- if (useGpu) {
- continue;
- }
-#endif
- LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
- << " numSparseNonValueVecSlots="
- << numSparseNonValueVecSlots
- << " numIdSlots=" << numIdSlots << " async=" << async
- << " useGpu=" << useGpu;
- int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
- numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
- numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
- numSparseNonValueVecSlots;
- numPerSlotType[SlotDef::INDEX] = numIdSlots;
- testProtoSequenceDataProvider(numPerSlotType, async, useGpu);
- } // end for (int useGpu : numTwoArray)
- } // end for (int async : numTwoArray)
- } // end for (int numDenseVecSlots : numSlotsArray)
- } // end for (int numIdSlots : numSlotsArray)
- } // end for (int numSparseNonValueVecSlots : numSlotsArray)
-}
diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp
index 4adaaef9838f0d178468af3af142031325bfc11d..a2ef731ecbcd18ca4bd0b2381de04650a2686c2d 100644
--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
@@ -17,9 +17,13 @@ limitations under the License. */
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
+#ifndef PADDLE_MOBILE_INFERENCE  // builds without PADDLE_MOBILE_INFERENCE keep the 512M default below
DEFINE_int32(pool_limit_size,
536870912,
"maximum memory size managed by a memory pool, default is 512M");
+#else  // PADDLE_MOBILE_INFERENCE is defined
+DEFINE_int32(pool_limit_size, 0, "default is 0");  // same flag name, default 0 for mobile-inference builds
+#endif  // PADDLE_MOBILE_INFERENCE
namespace paddle {
diff --git a/paddle/memory/README.md b/paddle/memory/README.md
index 7f95e80f980b0c0b93ecb418e6b923045313eaa5..6cb003c50bc7d142d65b0591e7e5235431d2ea42 100644
--- a/paddle/memory/README.md
+++ b/paddle/memory/README.md
@@ -1,4 +1,141 @@
# Region-based Heterogeneous Memory Management
+## Design
-Please check out the [design documentation](http://gangliao.me) to find out more details about
-buddy memory allocator for both CPU and GPU.
+### Usage
+
+To allocate 4KB CPU memory:
+
+```cpp
+p = memory::Alloc(platform::CPUPlace(), 4*1024);
+```
+
+To allocate 4KB memory on the 3rd GPU:
+
+```cpp
+p = memory::Alloc(platform::GPUPlace(2), 4*1024);
+```
+
+To free memory and check the so-far used amount of memory on a place:
+
+```cpp
+auto pl = platform::GPUPlace(0);
+p = memory::Alloc(pl, 4*1024);
+cout << memory::Used(pl);
+memory::Free(pl, p);
+```
+
+### API
+
+In `paddle/memory/memory.h` we have:
+
+```cpp
+namespace memory {
+template <typename Place> void* Alloc(Place, size_t);
+template <typename Place> void Free(Place, void*);
+template <typename Place> size_t Used(Place);
+} // namespace memory
+```
+
+These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
+
+```cpp
+template<>
+void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
+ return GetCPUBuddyAllocator()->Alloc(size);
+}
+```
+
+and
+
+```cpp
+template<>
+void* Alloc<GPUPlace>(GPUPlace p, size_t size) {
+ return GetGPUBuddyAllocator(p.id)->Alloc(size);
+}
+```
+
+Similar specializations exist for `Free` and `Used`.
+
+### Implementation
+
+`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletons.
+
+```cpp
+BuddyAllocator* GetCPUBuddyAllocator() {
+ static BuddyAllocator* a = NULL;
+ if (a == NULL) {
+ a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
+ }
+ return a;
+}
+
+BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+ static BuddyAllocator** as = NULL;
+ if (as == NULL) {
+ as = new BuddyAllocator*[platform::NumGPUs()];
+ for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
+ as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...);
+ }
+ }
+ return as[gpu_id];
+```
+
+#### `BuddyAllocator`
+
+`BuddyAllocator` implements the buddy allocation algorithm. Its constructor takes only parameters related to the algorithm:
+
+```cpp
+BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) {
+ ...
+}
+```
+
+Please be aware that **`BuddyAllocator` always allocates aligned memory**, aligned on 32 bytes, which can hold a `BuddyAllocator::Block` object:
+
+```cpp
+class BuddyAllocator {
+ private:
+ struct Block {
+ size_t size;
+ Block *left, *right;
+ size_t index; // allocator id
+ };
+ ...
+};
+```
+
+Because `BuddyAllocator` has the meta-data of each block, it can trace the used memory -- it records the amount returned by `Alloc` and freed in `Free`. In contrast, `CPUAllocator` and `GPUAllocator` don't know the size of a freed memory block and cannot do the trace.
+
+#### System Allocators
+
+The `GPUAllocator` and `CPUAllocator` are called *system allocators*. They work as the fallback allocators of `BuddyAllocator`.
+
+## Justification
+
+I got inspiration from Majel and Caffe2, though the above design looks different from both.
+
+### Caffe2
+
+In Caffe2, `Tensor::mutable_data()` allocates the memory. In particular, [`Tensor::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479).
+
+There are two implementations of `Context`:
+
+1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
+
+1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, which also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
+
+### Majel
+
+In Majel, there are basically two allocator types:
+
+1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`.
+1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`.
+
+However, memory allocation is not via these two allocators. Instead, these two allocators are defined in hidden namespaces.
+
+In Majel there are hidden global variables like:
+
+1. `cpu::SystemAllocator g_cpu_allocator`, and
+1. `vector<gpu::SystemAllocator> g_gpu_allocators(NUM_GPUS)`.
+
+Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`.
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 709f7de2e43093114d096cbfca5b5d49293a6d3e..a719da2560291dbc7e98aadfae41d4692d8afcad 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -9,6 +9,7 @@ function(op_library TARGET)
set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
set(cc_srcs)
set(cu_srcs)
+ set(cu_cc_srcs)
set(op_common_deps operator op_registry math_function)
set(options "")
set(oneValueArgs "")
@@ -22,6 +23,9 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND cc_srcs ${TARGET}.cc)
endif()
+ if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
+ list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
+ endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
@@ -29,6 +33,8 @@ function(op_library TARGET)
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
+ elseif(${src} MATCHES ".*\\.cu.cc$")
+ list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
@@ -43,7 +49,7 @@ function(op_library TARGET)
endif()
if (WITH_GPU)
- nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
+ nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
@@ -140,7 +146,9 @@ function(op_library TARGET)
# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
- if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0)
+ list(LENGTH cu_cc_srcs cu_cc_srcs_len)
+
+ if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
@@ -160,11 +168,12 @@ set(DEPS_OPS
recurrent_op
dynamic_recurrent_op
softmax_with_cross_entropy_op
+ softmax_op
+ sequence_softmax_op
sum_op
pool_op
pool_with_index_op
conv_op
- lstm_op
conv_transpose_op
nccl_op
sequence_conv_op
@@ -174,13 +183,20 @@ set(DEPS_OPS
array_to_lod_tensor_op
lstm_op
tensor_array_read_write_op
- gru_op)
+ gru_op
+ adagrad_op
+ sgd_op)
+
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
+op_library(softmax_op DEPS softmax)
+op_library(sequence_softmax_op DEPS softmax)
+op_library(sum_op DEPS selected_rows_functor)
+op_library(sgd_op DEPS selected_rows_functor)
+op_library(adagrad_op DEPS selected_rows_functor)
op_library(conv_op DEPS vol2col)
-op_library(sum_op DEPS net_op selected_rows_functor)
op_library(pool_op DEPS pooling)
op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
@@ -220,6 +236,6 @@ cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc
rnn/recurrent_op_utils.cc
DEPS dynamic_recurrent_op)
if(WITH_GPU)
- nv_test(nccl_op_test SRCS nccl_op_test.cu DEPS nccl_op gpu_info device_context)
+ cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc
index 8d1a2b7938d2c6607cbeb3cecb72d1d5b83dd8b9..d6686e3ef3165976cf4c077a7a0f213082aa7716 100644
--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
@@ -14,6 +14,11 @@ limitations under the License. */
#include "paddle/operators/adagrad_op.h"
+#include
+
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
+
namespace paddle {
namespace operators {
@@ -21,7 +26,7 @@ class AdagradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
- void InferShape(framework::InferShapeContext *ctx) const override {
+ void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(Param) of AdagradOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
@@ -54,8 +59,8 @@ class AdagradOp : public framework::OperatorWithKernel {
class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
public:
- AdagradOpMaker(framework::OpProto *proto,
- framework::OpAttrChecker *op_checker)
+ AdagradOpMaker(framework::OpProto* proto,
+ framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Param", "(Tensor) Input parameter");
AddInput("Grad", "(Tensor) Input gradient");
@@ -87,10 +92,85 @@ for numerical stability to avoid the division by zero error.
)DOC");
}
};
+
+namespace {
+// Returns the index of `value` inside `rows`.
+//
+// Precondition: `rows` is sorted ascending, holds no duplicates, and contains
+// `value`. The only caller passes the merged row list, which is produced by
+// iterating a std::set and therefore satisfies this. A binary search keeps
+// each lookup at O(log n) instead of a linear scan per gradient row.
+size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
+  return std::lower_bound(rows.begin(), rows.end(), value) - rows.begin();
+}
+} // namespace
+
+// CPU specialization of the sparse Adagrad update.
+//
+// `grad` is a SelectedRows gradient whose row list may repeat an index.
+// The functor:
+//   1. sums all gradient rows that share an index into one merged row per
+//      distinct index,
+//   2. feeds the element-wise square of the merged gradient to
+//      math::SelectedRowsAddToTensor together with `moment` (m += g_m * g_m),
+//   3. updates the touched rows of `param` with
+//        param -= lr * g_m / (sqrt(moment) + epsilon).
+// `moment` and `param` are modified in place; only element 0 of
+// `learning_rate` is read.
+template <typename T>
+struct SparseAdagradFunctor<platform::CPUPlace, T> {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& grad,
+                  const framework::Tensor& learning_rate, T epsilon,
+                  framework::Tensor* moment, framework::Tensor* param) {
+    // 1. g_m.rows = set(g.rows)
+    // Bound by reference: the row list is only read, so no copy is needed.
+    const auto& grad_rows = grad.rows();
+    std::set<int64_t> row_set(grad_rows.begin(), grad_rows.end());
+    // Iterating the set yields the distinct row ids in ascending order.
+    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
+
+    auto grad_width = grad.value().dims()[1];
+    std::unique_ptr<framework::SelectedRows> grad_merge{
+        new framework::SelectedRows()};
+    grad_merge->set_rows(merge_rows);
+    grad_merge->set_height(grad.height());
+    grad_merge->mutable_value()->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merge_rows.size()), grad_width}),
+        context.GetPlace());
+
+    // The merged buffer is accumulated into below, so it must start at zero.
+    math::SetConstant<platform::CPUPlace, T> constant_functor;
+    constant_functor(context, grad_merge->mutable_value(), 0.0);
+
+    auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
+    auto* grad_data = grad.value().data<T>();
+
+    for (size_t i = 0; i < grad_rows.size(); i++) {
+      // Position of this row's id inside the merged (deduplicated) row list.
+      size_t grad_merge_i = FindPos(merge_rows, grad_rows[i]);
+      for (int64_t j = 0; j < grad_width; j++) {
+        grad_merge_data[grad_merge_i * grad_width + j] +=
+            grad_data[i * grad_width + j];
+      }
+    }
+
+    // 2. m += g_m * g_m
+    std::unique_ptr<framework::SelectedRows> grad_square{
+        new framework::SelectedRows()};
+    grad_square->set_rows(grad_merge->rows());
+    grad_square->set_height(grad_merge->height());
+    grad_square->mutable_value()->mutable_data<T>(grad_merge->value().dims(),
+                                                  context.GetPlace());
+    auto gs =
+        framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
+    auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
+    gs.device(*context.GetEigenDevice<platform::CPUPlace>()) = gm * gm;
+
+    math::SelectedRowsAddToTensor<platform::CPUPlace, T> functor;
+    functor(context, *grad_square, moment);
+
+    // 3. update parameter
+    auto* lr = learning_rate.data<T>();
+    auto* param_data = param->data<T>();
+    auto* moment_data = moment->data<T>();
+
+    for (size_t i = 0; i < merge_rows.size(); i++) {
+      // Start of row merge_rows[i] inside the param / moment buffers.
+      const int64_t row_offset = merge_rows[i] * grad_width;
+      for (int64_t j = 0; j < grad_width; j++) {
+        param_data[row_offset + j] -=
+            lr[0] * grad_merge_data[i * grad_width + j] /
+            (std::sqrt(moment_data[row_offset + j]) + epsilon);
+      }
+    }
+  }
+};
+
+template struct SparseAdagradFunctor<platform::CPUPlace, float>;
+template struct SparseAdagradFunctor<platform::CPUPlace, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker);
-REGISTER_OP_CPU_KERNEL(adagrad,
- ops::AdagradOpKernel);
+REGISTER_OP_CPU_KERNEL(
+ adagrad, ops::AdagradOpKernel,
+ ops::AdagradOpKernel);
diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu
index a5b7951121360f78612f9008a522235104708112..5b869e6bc5f4604ba6055ffd62fa21e4a1f41b93 100644
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -14,7 +14,138 @@
#define EIGEN_USE_GPU
#include "paddle/operators/adagrad_op.h"
+#include "paddle/operators/math/selected_rows_functor.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+namespace {
+
+// Accumulates one input gradient row into its slot in the merged gradient.
+//
+// Launch layout: blockIdx.y selects the input row; the threads of a block
+// walk that row's `row_numel` elements with a stride of `block_size`, so
+// `block_size` is expected to equal the launched blockDim.x.
+// `grad_merge_rows` must contain grad_rows[blockIdx.y]; the caller builds it
+// from the distinct values of `grad_rows`.
+template <typename T, int block_size>
+__global__ void MergeGradKernel(const T* grad, const int64_t* grad_rows,
+                                T* grad_merge, const int64_t* grad_merge_rows,
+                                size_t grad_merge_rows_size,
+                                int64_t row_numel) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+  __shared__ size_t grad_merge_idx;
+
+  // Thread 0 looks up the destination row once and publishes it to the block.
+  if (tid == 0) {
+    for (size_t i = 0; i < grad_merge_rows_size; i++) {
+      if (grad_rows[ty] == grad_merge_rows[i]) {
+        grad_merge_idx = i;
+        // Merged row ids are distinct, so the first match is the only one.
+        break;
+      }
+    }
+  }
+
+  __syncthreads();
+
+  grad += ty * row_numel;
+  grad_merge += grad_merge_idx * row_numel;
+  for (int index = tid; index < row_numel; index += block_size) {
+    // Several blocks may target the same merged row (duplicate input row
+    // ids), hence the atomic accumulation.
+    paddle::platform::CudaAtomicAdd(grad_merge + index, grad[index]);
+  }
+}
+
+// Applies the Adagrad parameter step to the rows listed in `rows`:
+//   param[rows[y]][k] += -lr * grad[y][k] / (sqrt(moment[rows[y]][k]) + eps)
+//
+// Launch layout: blockIdx.y selects the entry of `rows` / the row of `grad`;
+// the threads of a block walk the `row_numel` elements with a stride of
+// `block_size`, so `block_size` is expected to equal the launched blockDim.x.
+// Only learning_rate[0] is read; `moment` is read but not written here.
+template <typename T, int block_size>
+__global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows,
+                                           const T* learning_rate, T* param,
+                                           T* moment, int64_t row_numel,
+                                           T epsilon) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+
+  grad += ty * row_numel;
+  param += rows[ty] * row_numel;
+  moment += rows[ty] * row_numel;
+
+  for (int index = tid; index < row_numel; index += block_size) {
+    // Since index in rows of SelectedRows can be duplicate, we have to use
+    // Atomic Operation to avoid concurrent write error.
+    paddle::platform::CudaAtomicAdd(param + index,
+                                    -1.0 * learning_rate[0] * grad[index] /
+                                        (sqrt(moment[index]) + epsilon));
+  }
+}
+} // namespace
+
+// GPU specialization of the sparse Adagrad update.
+//
+// `grad` is a SelectedRows gradient whose row list may repeat an index.
+// The functor:
+//   1. sums all gradient rows that share an index into one merged row per
+//      distinct index (MergeGradKernel),
+//   2. feeds the element-wise square of the merged gradient to
+//      math::SelectedRowsAddToTensor together with `moment` (m += g_m * g_m),
+//   3. updates the touched rows of `param` (SparseAdagradFunctorKernel).
+// Both kernels are enqueued on the stream of `context`; `moment` and `param`
+// are modified in place.
+//
+// NOTE(review): the kernels receive row-index pointers obtained from
+// rows().data(); this relies on that storage being readable from device
+// code -- confirm against the SelectedRows row container.
+template <typename T>
+struct SparseAdagradFunctor<platform::GPUPlace, T> {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& grad,
+                  const framework::Tensor& learning_rate, T epsilon,
+                  framework::Tensor* moment, framework::Tensor* param) {
+    // 1. g_m.rows = set(g.rows)
+    // Bound by reference: the row list is only read, so no copy is needed.
+    const auto& grad_rows = grad.rows();
+    std::set<int64_t> row_set(grad_rows.begin(), grad_rows.end());
+    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
+
+    auto grad_width = grad.value().dims()[1];
+    std::unique_ptr<framework::SelectedRows> grad_merge{
+        new framework::SelectedRows()};
+    grad_merge->set_rows(merge_rows);
+    grad_merge->set_height(grad.height());
+    grad_merge->mutable_value()->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merge_rows.size()), grad_width}),
+        context.GetPlace());
+
+    // The merged buffer is accumulated into by the kernel, so zero it first.
+    math::SetConstant<platform::GPUPlace, T> constant_functor;
+    constant_functor(context, grad_merge->mutable_value(), 0.0);
+
+    auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
+    auto* grad_data = grad.value().data<T>();
+
+    // One block per row (grid y dimension), block_size threads per block.
+    // The same constant is the kernels' stride template argument.
+    const int block_size = 256;
+    dim3 threads(block_size, 1);
+    dim3 grid1(1, grad_rows.size());
+
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+
+    MergeGradKernel<T, block_size><<<grid1, threads, 0, stream>>>(
+        grad_data, grad.rows().data(), grad_merge_data,
+        grad_merge->rows().data(), grad_merge->rows().size(), grad_width);
+
+    // 2. m += g_m * g_m
+    std::unique_ptr<framework::SelectedRows> grad_square{
+        new framework::SelectedRows()};
+    grad_square->set_rows(grad_merge->rows());
+    grad_square->set_height(grad_merge->height());
+    grad_square->mutable_value()->mutable_data<T>(grad_merge->value().dims(),
+                                                  context.GetPlace());
+    auto gs =
+        framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
+    auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
+    gs.device(*context.GetEigenDevice<platform::GPUPlace>()) = gm * gm;
+
+    math::SelectedRowsAddToTensor<platform::GPUPlace, T> functor;
+    functor(context, *grad_square, moment);
+
+    // 3. update parameter
+    auto* lr = learning_rate.data<T>();
+    auto* param_data = param->data<T>();
+    auto* moment_data = moment->data<T>();
+
+    dim3 grid2(1, merge_rows.size());
+    SparseAdagradFunctorKernel<T, block_size><<<grid2, threads, 0, stream>>>(
+        grad_merge_data, grad_merge->rows().data(), lr, param_data,
+        moment_data, grad_width, epsilon);
+  }
+};
+
+template struct SparseAdagradFunctor<platform::GPUPlace, float>;
+template struct SparseAdagradFunctor<platform::GPUPlace, double>;
+
+} // namespace operators
+} // namespace paddle
namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(adagrad,
- ops::AdagradOpKernel);
+REGISTER_OP_GPU_KERNEL(
+ adagrad, ops::AdagradOpKernel,
+ ops::AdagradOpKernel);
diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h
index c5d8f751d3527f89b96d4274328ba0bb5f6efa44..4d4a6434c7c472d8ceb01edfc4050fbb009d6c9f 100644
--- a/paddle/operators/adagrad_op.h
+++ b/paddle/operators/adagrad_op.h
@@ -19,35 +19,59 @@ limitations under the License. */
namespace paddle {
namespace operators {
+// Adagrad step for a SelectedRows (sparse) gradient.
+//
+// Only the call operator is declared here; adagrad_op.cc and adagrad_op.cu
+// each define and explicitly instantiate it for their device.
+//
+//   context        device context the work is issued on
+//   grad           sparse gradient (row ids plus a value tensor)
+//   learning_rate  tensor whose first element is used as the step size
+//   epsilon        added to sqrt(moment) in the denominator
+//   moment, param  updated in place
+template <typename Place, typename T>
+struct SparseAdagradFunctor {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& grad,
+                  const framework::Tensor& learning_rate, T epsilon,
+                  framework::Tensor* moment, framework::Tensor* param);
+};
+
template
class AdagradOpKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
- auto param_out_tensor = ctx.Output("ParamOut");
- auto moment_out_tensor = ctx.Output("MomentOut");
+ auto* param_out_tensor = ctx.Output("ParamOut");
+ auto* moment_out_tensor = ctx.Output("MomentOut");
param_out_tensor->mutable_data(ctx.GetPlace());
moment_out_tensor->mutable_data(ctx.GetPlace());
- float epsilon = ctx.Attr("epsilon");
-
- auto param = framework::EigenVector::Flatten(
- *ctx.Input("Param"));
- auto grad = framework::EigenVector::Flatten(
- *ctx.Input("Grad"));
- auto moment = framework::EigenVector::Flatten(
- *ctx.Input("Moment"));
- auto lr = framework::EigenVector::Flatten(
- *ctx.Input("LearningRate"));
-
- auto param_out = framework::EigenVector::Flatten(*param_out_tensor);
- auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor);
- auto place = ctx.GetEigenDevice();
-
- moment_out.device(place) = moment + grad * grad;
- Eigen::DSizes m_dsize(moment_out_tensor->numel());
- param_out.device(place) =
- param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
+ T epsilon = static_cast(ctx.Attr