Commit 5d2b3fb0 authored by: F fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev_add_FillZerosLikeOp_test

...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
description: Format files with ClangFormat. description: Format files with ClangFormat.
entry: clang-format -i entry: clang-format -i
language: system language: system
files: \.(c|cc|cxx|cpp|h|hpp|hxx)$ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang - repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks: hooks:
......
...@@ -36,8 +36,8 @@ include(simd) ...@@ -36,8 +36,8 @@ include(simd)
################################ Configurations ####################################### ################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." OFF) option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND})
option(WITH_MKLML "Compile PaddlePaddle with mklml package." OFF) option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
......
...@@ -27,13 +27,16 @@ RUN apt-get update && \ ...@@ -27,13 +27,16 @@ RUN apt-get update && \
git python-pip python-dev openssh-server bison \ git python-pip python-dev openssh-server bison \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \ curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-numpy python-matplotlib gcc-4.8 g++-4.8 \ python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format-3.8 swig doxygen cmake \ automake locales clang-format-3.8 swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \ liblapack-dev liblapacke-dev libboost-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools && \ net-tools && \
apt-get clean -y apt-get clean -y
# paddle is using numpy.flip, which is introduced since 1.12.0
RUN pip --no-cache-dir install 'numpy>=1.12.0'
# Install Go and glide # Install Go and glide
RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
tar -C /usr/local -xzf go.tgz && \ tar -C /usr/local -xzf go.tgz && \
......
...@@ -74,8 +74,6 @@ if(WITH_MKLDNN) ...@@ -74,8 +74,6 @@ if(WITH_MKLDNN)
set(OPENMP_FLAGS "-fopenmp") set(OPENMP_FLAGS "-fopenmp")
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
else() else()
......
...@@ -42,26 +42,21 @@ macro(add_style_check_target TARGET_NAME) ...@@ -42,26 +42,21 @@ macro(add_style_check_target TARGET_NAME)
if(WITH_STYLE_CHECK) if(WITH_STYLE_CHECK)
set(SOURCES_LIST ${ARGN}) set(SOURCES_LIST ${ARGN})
list(REMOVE_DUPLICATES SOURCES_LIST) list(REMOVE_DUPLICATES SOURCES_LIST)
list(SORT SOURCES_LIST)
foreach(filename ${SOURCES_LIST}) foreach(filename ${SOURCES_LIST})
set(LINT ON)
foreach(pattern ${IGNORE_PATTERN}) foreach(pattern ${IGNORE_PATTERN})
if(filename MATCHES ${pattern}) if(filename MATCHES ${pattern})
message(STATUS "DROP LINT ${filename}") list(REMOVE_ITEM SOURCES_LIST ${filename})
set(LINT OFF)
endif() endif()
endforeach() endforeach()
if(LINT MATCHES ON)
# cpplint code style
get_filename_component(base_filename ${filename} NAME)
set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD
COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
"--filter=${STYLE_FILTER}"
"--write-success=${CUR_GEN}" ${filename}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endforeach() endforeach()
if(SOURCES_LIST)
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
"--filter=${STYLE_FILTER}"
${SOURCES_LIST}
COMMENT "cpplint: Checking source code style"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endif() endif()
endmacro() endmacro()
...@@ -7,7 +7,7 @@ INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any) ...@@ -7,7 +7,7 @@ INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any)
ExternalProject_Add( ExternalProject_Add(
extern_lib_any extern_lib_any
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/thelink2012/any.git" GIT_REPOSITORY "https://github.com/PaddlePaddle/any.git"
GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020" GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020"
PREFIX ${ANY_SOURCE_DIR} PREFIX ${ANY_SOURCE_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
......
...@@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ...@@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
ExternalProject_Add( ExternalProject_Add(
extern_gflags extern_gflags
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/gflags/gflags.git" # TODO(yiwang): The annoying warnings mentioned in
# https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by
# gflags. I fired a PR https://github.com/gflags/gflags/pull/230
# to fix it. Before it gets accepted by the gflags team, we use
# my personal fork, which contains above fix, temporarily. Let's
# change this back to the official Github repo once my PR is
# merged.
GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git"
PREFIX ${GFLAGS_SOURCES_DIR} PREFIX ${GFLAGS_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
...@@ -69,8 +69,13 @@ ENDIF(NOT ${CBLAS_FOUND}) ...@@ -69,8 +69,13 @@ ENDIF(NOT ${CBLAS_FOUND})
MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
ADD_LIBRARY(cblas STATIC IMPORTED) # FIXME(gangliao): generate cblas target to track all high performance
SET_PROPERTY(TARGET cblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES}) # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
ADD_LIBRARY(cblas STATIC ${dummyfile})
TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
IF(NOT ${CBLAS_FOUND}) IF(NOT ${CBLAS_FOUND})
ADD_DEPENDENCIES(cblas extern_openblas) ADD_DEPENDENCIES(cblas extern_openblas)
LIST(APPEND external_project_dependencies cblas) LIST(APPEND external_project_dependencies cblas)
......
...@@ -24,7 +24,6 @@ IF(WITH_PYTHON) ...@@ -24,7 +24,6 @@ IF(WITH_PYTHON)
ENDIF(WITH_PYTHON) ENDIF(WITH_PYTHON)
SET(py_env "") SET(py_env "")
SET(USE_VIRTUALENV_FOR_TEST 1)
IF(PYTHONINTERP_FOUND) IF(PYTHONINTERP_FOUND)
find_python_module(pip REQUIRED) find_python_module(pip REQUIRED)
find_python_module(numpy REQUIRED) find_python_module(numpy REQUIRED)
......
...@@ -115,7 +115,7 @@ set(COMMON_FLAGS ...@@ -115,7 +115,7 @@ set(COMMON_FLAGS
-Wno-error=literal-suffix -Wno-error=literal-suffix
-Wno-error=sign-compare -Wno-error=sign-compare
-Wno-error=unused-local-typedefs -Wno-error=unused-local-typedefs
-Wno-error=parentheses-equality # Warnings in Pybind11 -Wno-error=parentheses-equality # Warnings in pybind11
) )
set(GPU_COMMON_FLAGS set(GPU_COMMON_FLAGS
...@@ -195,6 +195,7 @@ endif() ...@@ -195,6 +195,7 @@ endif()
# Modern gpu architectures: Pascal # Modern gpu architectures: Pascal
if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0") if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60") list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
endif() endif()
# Custom gpu architecture # Custom gpu architecture
......
...@@ -403,3 +403,16 @@ function(py_proto_compile TARGET_NAME) ...@@ -403,3 +403,16 @@ function(py_proto_compile TARGET_NAME)
protobuf_generate_python(py_srcs ${py_proto_compile_SRCS}) protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs}) add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
endfunction() endfunction()
function(py_test TARGET_NAME)
if(WITH_TESTING)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
python2 ${py_test_SRCS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endfunction()
...@@ -118,7 +118,6 @@ endfunction() ...@@ -118,7 +118,6 @@ endfunction()
macro(add_unittest_without_exec TARGET_NAME) macro(add_unittest_without_exec TARGET_NAME)
add_executable(${TARGET_NAME} ${ARGN}) add_executable(${TARGET_NAME} ${ARGN})
link_paddle_test(${TARGET_NAME}) link_paddle_test(${TARGET_NAME})
add_style_check_target(${TARGET_NAME} ${ARGN})
endmacro() endmacro()
# add_unittest # add_unittest
...@@ -150,9 +149,12 @@ endfunction() ...@@ -150,9 +149,12 @@ endfunction()
# Create a python unittest using run_python_tests.sh, # Create a python unittest using run_python_tests.sh,
# which takes care of making correct running environment # which takes care of making correct running environment
function(add_python_test TEST_NAME) function(add_python_test TEST_NAME)
add_test(NAME ${TEST_NAME} foreach(arg ${ARGN})
COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR} get_filename_component(py_fn ${arg} NAME_WE)
bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh set(TRG_NAME ${TEST_NAME}_${py_fn})
${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN} add_test(NAME ${TRG_NAME}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
python2 ${arg}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endforeach()
endfunction() endfunction()
# Intel® MKL-DNN on PaddlePaddle: Design Doc
We plan to integrate the Intel Math Kernel Library for Deep Neural Networks (**MKL-DNN**\[[1](#references)\]) into PaddlePaddle to take full advantage of Intel platforms and effectively improve PaddlePaddle's performance on Intel architectures.
Our basic short-term goals are:
- Implement MKL-DNN versions of the commonly used layers.
- Provide MKL-DNN implementations of the common deep neural networks VGG, GoogLeNet, and ResNet.
## Contents
- [Overview](#overview)
- [Actions](#actions)
- [CMake](#cmake)
- [Layers](#layers)
- [Activations](#activations)
- [Unit Tests](#unit-tests)
- [Protobuf Messages](#protobuf-messages)
- [Python API](#python-api)
- [Demos](#demos)
- [Benchmarking](#benchmarking)
- [Others](#others)
- [Design Concerns](#design-concerns)
## Overview
MKL-DNN will be integrated into PaddlePaddle as a third-party library; the overall architecture is shown below.
<div align="center">
<img src="image/overview.png" width=350><br/>
Figure 1. PaddlePaddle on IA.
</div>
## Actions
We have roughly divided the integration plan into the following aspects.
### CMake
We will add a `WITH_MKLDNN` option to `CMakeLists.txt`; setting it to `ON` enables building with MKL-DNN and also automatically turns on OpenMP to improve MKL-DNN performance.
We will also introduce a `WITH_MKLML` option to choose whether to use the MKLML package shipped with MKL-DNN. This package can be used independently of MKL-DNN, but we recommend turning MKLML on whenever MKL-DNN is enabled in order to get the best performance.
Accordingly, we will add `mkldnn.cmake` and `mklml.cmake` files under the `cmake/external` directory; they download the corresponding packages while PaddlePaddle is being built and place them in PaddlePaddle's third-party directory.
**Note**: when `WITH_MKLML=ON`, this package is used preferentially as PaddlePaddle's CBLAS and LAPACK library, so the logic in `cmake/cblas.cmake` will be adjusted slightly.
### Layers
All MKL-DNN related C++ layers will be placed under `paddle/gserver/layers`, following PaddlePaddle's directory structure, and their file names will start with *Mkldnn*.
All MKL-DNN layers will inherit from a common parent class called `MkldnnLayer`, which in turn inherits from PaddlePaddle's base class `Layer`.
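A minimal, self-contained C++ sketch of this hierarchy is given below. Only the class names `Layer` and `MkldnnLayer` come from the plan above; the member functions and the example `MkldnnFcLayer` are hypothetical stand-ins for PaddlePaddle's real layer interface.

```cpp
// Hypothetical sketch: only Layer and MkldnnLayer are names from the plan;
// the init/forward/backward hooks and MkldnnFcLayer are illustrative.
class Layer {  // stand-in for PaddlePaddle's base class paddle::Layer
public:
  virtual ~Layer() = default;
  virtual bool init() { return true; }
  virtual void forward() = 0;
  virtual void backward() = 0;
};

// Common parent of every MKL-DNN layer, stored as Mkldnn*.{h,cpp}
// under paddle/gserver/layers.
class MkldnnLayer : public Layer {
public:
  bool init() override {
    // shared MKL-DNN setup (engine, stream, memory formats) would go here
    return Layer::init();
  }
};

// A concrete layer, e.g. a fully connected layer, derives from MkldnnLayer
// and implements forward/backward with MKL-DNN primitives.
class MkldnnFcLayer : public MkldnnLayer {
public:
  void forward() override { /* run the MKL-DNN inner-product primitive */ }
  void backward() override { /* run the MKL-DNN backward primitives */ }
};
```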
### Activations
Since activation functions in PaddlePaddle are independent of the layer concept, we will add an `MkldnnActivation.h` file under the `paddle/gserver/activations` directory to define the MKL-DNN related interfaces, while the implementations remain in `ActivationFunction.cpp`.
### Unit Tests
We will add `test_Mkldnn.cpp` and `MkldnnTester.*` under `paddle/gserver/test` for MKL-DNN testing.
For activation tests, we plan to add new test types directly to PaddlePaddle's existing test files.
### Protobuf Messages
Depending on the needs of specific layers, the necessary options may be added to `proto/ModelConfig.proto`.
### Python API
For now we only consider the **v1 API**.
We plan to add a `use_mkldnn` option to `python/paddle/trainer/config_parser.py` so that users can conveniently choose the MKL-DNN layers.
For example, the implementation could look roughly like this:
```python
use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
if use_mkldnn:
    # hypothetical: MKL-DNN layer types get the "mkldnn_" prefix, e.g. "mkldnn_fc"
    self.layer_type = "mkldnn_" + self.layer_type
```
All MKL-DNN layer types will start with the *mkldnn_* prefix to distinguish them.
We may also add the necessary MKL-DNN interfaces to `activations.py` and `layers.py` under the `python/paddle/trainer_config_helper` directory.
### Demos
An `mkldnn` folder will be added under the `v1_api_demo` directory, containing demo scripts for MKL-DNN testing.
### Benchmarking
We will consider adding logic to `benchmark/paddle/image/run.sh` to add tests that use MKL-DNN.
### Others
1. When MKL-DNN is used, CPU buffers will be aligned to 64 bytes (a minimal alignment sketch follows this list).
2. Dig deeper into PaddlePaddle for other optimization opportunities, for example using OpenMP to improve the performance of SGD updates.
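As a side note on item 1 above, the sketch below shows 64-byte alignment with plain C++17 `std::aligned_alloc`; PaddlePaddle's actual allocator is not shown here and may differ.

```cpp
#include <cstdlib>

// Minimal illustration of a 64-byte aligned CPU buffer (item 1 above);
// this is not PaddlePaddle's allocator, just standard C++17.
int main() {
  constexpr std::size_t kAlignment = 64;  // MKL-DNN friendly alignment
  constexpr std::size_t kBytes = 1024;    // must be a multiple of kAlignment
  void* buf = std::aligned_alloc(kAlignment, kBytes);
  if (buf == nullptr) return 1;
  std::free(buf);
  return 0;
}
```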
## Design Concerns
To better conform to PaddlePaddle's coding style\[[2](#references)\] while sacrificing as little MKL-DNN performance as possible\[[3](#references)\],
we have summarized the points that need special attention:
1. Use **deviceId_**. To add as few new variables or functions as possible to the parent class `Layer`, we reuse the existing `deviceId_` member to distinguish layer attributes, defining `-2` as the device ID reserved for `MkldnnLayer`.
2. Override the parent class `Layer`'s **init** function and set `deviceId_` to `-2`, indicating that the layer runs in the MKL-DNN environment.
3. Create `MkldnnMatrix` to manage the memory functions, interfaces, and format information that MKL-DNN needs.
4. Create `MkldnnBase` to define classes and functions beyond those related to layers and memory, including the `MkldnnStream` and `CpuEngine` used by MKL-DNN, and possibly an `FPGAEngine` in the future.
5. Add two `MkldnnMatrixPtr` members to **Argument**, named `mkldnnValue` and `mkldnnGrad`, to hold the memory buffers used by `MkldnnLayer`, and add a cvt function (to be renamed to something more suitable) that converts memory between the "CPU device" and the "MKL-DNN device".
6. Add logic to the parent class `Layer`'s `getOutput` function that checks `deviceId` and, when the device is not consistent between MKL-DNN and CPU, performs an up-front conversion, i.e. calls `Argument`'s cvt function to unify the output onto the required device (see the sketch after this list).
7. Add a `use_mkldnn` flag to the existing `FLAGS` to choose whether the MKL-DNN related functionality is used.
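To make concerns 5 and 6 concrete, here is a simplified, self-contained sketch. Only the names taken from the list above (`Argument`, `mkldnnValue`, `mkldnnGrad`, the cvt function, `deviceId_`, `getOutput`, and the reserved device ID `-2`) come from the plan; every surrounding type is an illustrative stand-in rather than PaddlePaddle's actual class.

```cpp
#include <memory>

// Hypothetical stand-ins so the sketch compiles on its own.
struct Matrix {};
struct MkldnnMatrix {};
using MatrixPtr = std::shared_ptr<Matrix>;
using MkldnnMatrixPtr = std::shared_ptr<MkldnnMatrix>;

constexpr int kMkldnnDeviceId = -2;  // device ID reserved for MkldnnLayer

struct Argument {
  MatrixPtr value;              // regular CPU buffer
  MkldnnMatrixPtr mkldnnValue;  // MKL-DNN formatted value buffer (concern 5)
  MkldnnMatrixPtr mkldnnGrad;   // MKL-DNN formatted gradient buffer (concern 5)

  // cvt (to be renamed) reorders memory between the "CPU device" and the
  // "MKL-DNN device"; left empty in this sketch.
  void cvt(int /*targetDeviceId*/) { /* reorder value <-> mkldnnValue */ }
};

class Layer {
public:
  // Concern 6: getOutput checks deviceId and converts up front when the
  // producing layer and the consumer are not on the same device.
  const Argument& getOutput(int deviceId) {
    if (deviceId_ == kMkldnnDeviceId && deviceId != deviceId_) {
      output_.cvt(deviceId);  // unify the output onto the requested device
    }
    return output_;
  }

protected:
  int deviceId_ = kMkldnnDeviceId;  // concerns 1 and 2: -2 marks an MKL-DNN layer
  Argument output_;
};

int main() {
  Layer layer;
  layer.getOutput(/*deviceId=*/0);  // a CPU consumer triggers the conversion
  return 0;
}
```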
## References
1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN")
2. The [original proposal](https://github.com/PaddlePaddle/Paddle/pull/3096) would have introduced **nextLayer** information; however, in PaddlePaddle neither the pre-refactoring layers nor the post-refactoring ops are meant to know anything about the next layer/op.
3. MKL-DNN's high-performance memory formats differ from PaddlePaddle's native `NCHW` (the cuDNN parts of PaddlePaddle also use `NCHW`, so they do not have this problem), so a conversion mechanism is needed, and the format should only be converted when necessary in order to get the best performance out of MKL-DNN.
...@@ -21,22 +21,15 @@ ...@@ -21,22 +21,15 @@
# #
# It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...} # It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...}
# #
PYPATH=""
if ! python -c "import paddle" >/dev/null 2>/dev/null; then set -x
PYPATH="" while getopts "d:" opt; do
set -x case $opt in
while getopts "d:" opt; do d)
case $opt in PYPATH=$OPTARG
d) ;;
PYPATH=$OPTARG esac
;; done
esac shift $(($OPTIND - 1))
done export PYTHONPATH=$PYPATH:$PYTHONPATH
shift $(($OPTIND - 1)) $@
export PYTHONPATH=$PYPATH:$PYTHONPATH
$@
else
echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."
echo "Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'"
exit 1
fi
...@@ -22,7 +22,5 @@ if(WITH_C_API) ...@@ -22,7 +22,5 @@ if(WITH_C_API)
endif() endif()
if(WITH_SWIG_PY) if(WITH_SWIG_PY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
add_subdirectory(api) add_subdirectory(api)
endif() endif()
...@@ -82,9 +82,7 @@ SWIG_LINK_LIBRARIES(swig_paddle ...@@ -82,9 +82,7 @@ SWIG_LINK_LIBRARIES(swig_paddle
add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PROJ_ROOT}/paddle/py_paddle COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PROJ_ROOT}/paddle/py_paddle
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch .timestamp
COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp
COMMAND rm -rf py_paddle.egg-info build
WORKING_DIRECTORY ${PROJ_ROOT}/paddle WORKING_DIRECTORY ${PROJ_ROOT}/paddle
DEPENDS _swig_paddle DEPENDS _swig_paddle
) )
...@@ -92,10 +90,6 @@ add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so ...@@ -92,10 +90,6 @@ add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so
# TODO(yuyang18) : make wheel name calculated by cmake # TODO(yuyang18) : make wheel name calculated by cmake
add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so) add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so)
install(DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/dist/
DESTINATION opt/paddle/share/wheels
)
if(WITH_TESTING) if(WITH_TESTING)
IF(NOT PY_PIP_FOUND) IF(NOT PY_PIP_FOUND)
SET(PIP_SOURCES_DIR ${PYTHON_SOURCES_DIR}/pip) SET(PIP_SOURCES_DIR ${PYTHON_SOURCES_DIR}/pip)
...@@ -108,7 +102,7 @@ if(WITH_TESTING) ...@@ -108,7 +102,7 @@ if(WITH_TESTING)
BUILD_COMMAND "" BUILD_COMMAND ""
INSTALL_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install INSTALL_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
BUILD_IN_SOURCE 1 BUILD_IN_SOURCE 1
DEPENDS python setuptools python_api_wheel #DEPENDS python setuptools python_api_wheel
) )
ENDIF() ENDIF()
add_subdirectory(test) add_subdirectory(test)
......
add_python_test(test_swig_api py_test(testTrain SRCS testTrain.py)
testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py testTrainer.py) py_test(testMatrix SRCS testMatrix.py)
py_test(testVector SRCS testVector.py)
py_test(testTrainer SRCS testTrainer.py)
py_test(testArguments SRCS testArguments.py)
py_test(testGradientMachine SRCS testGradientMachine.py)
...@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_batch_transpose.h"
#include "hl_base.h" #include "hl_base.h"
#include "hl_batch_transpose.h"
const int TILE_DIM = 64; const int TILE_DIM = 64;
const int BLOCK_ROWS = 16; const int BLOCK_ROWS = 16;
// No bank-conflict transpose for a batch of data. // No bank-conflict transpose for a batch of data.
__global__ void batchTransposeNoBankConflicts(real* odata, __global__ void batchTransposeNoBankConflicts(
const real* idata, real* odata, const real* idata, int numSamples, int width, int height) {
int numSamples, int width,
int height) {
__shared__ float tile[TILE_DIM][TILE_DIM + 1]; __shared__ float tile[TILE_DIM][TILE_DIM + 1];
const int x = blockIdx.x * TILE_DIM + threadIdx.x; const int x = blockIdx.x * TILE_DIM + threadIdx.x;
...@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata, ...@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
newX] = tile[threadIdx.x][j]; newX] = tile[threadIdx.x][j];
} }
void batchTranspose(const real* input, real* output, int width, int height, void batchTranspose(
int batchSize) { const real* input, real* output, int width, int height, int batchSize) {
dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize); dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>> batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
(output, input, batchSize, width, height); output, input, batchSize, width, height);
CHECK_SYNC("batchTranspose failed!"); CHECK_SYNC("batchTranspose failed!");
} }
...@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_aggregate.h"
#include "hl_base.h" #include "hl_base.h"
#include "hl_cuda.h" #include "hl_cuda.h"
#include "hl_cuda.ph" #include "hl_cuda.ph"
#include "hl_aggregate.h"
#include "hl_thread.ph"
#include "hl_matrix_base.cuh" #include "hl_matrix_base.cuh"
#include "hl_thread.ph"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
/** /**
* @brief matrix row operator. * @brief matrix row operator.
*/ */
template<class Agg, int blockSize> template <class Agg, int blockSize>
__global__ void KeMatrixRowOp(Agg agg, __global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
real *E,
real *Sum,
int dimN) {
__shared__ real sum_s[blockSize]; __shared__ real sum_s[blockSize];
int cnt = (dimN + blockSize -1) / blockSize; int cnt = (dimN + blockSize - 1) / blockSize;
int rowId = blockIdx.x + blockIdx.y*gridDim.x; int rowId = blockIdx.x + blockIdx.y * gridDim.x;
int index = rowId*dimN; int index = rowId * dimN;
int tid = threadIdx.x; int tid = threadIdx.x;
int lmt = tid; int lmt = tid;
...@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg, ...@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg,
sum_s[tid] = tmp; sum_s[tid] = tmp;
__syncthreads(); __syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) { for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) { if (tid < stride) {
sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]); sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
} }
...@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg, ...@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg,
} }
template <class Agg> template <class Agg>
void hl_matrix_row_op(Agg agg, void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
real *A_d,
real *C_d,
int dimM,
int dimN) {
int blocksX = dimM; int blocksX = dimM;
int blocksY = 1; int blocksY = 1;
dim3 threads(128, 1); dim3 threads(128, 1);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixRowOp<Agg, 128><<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
(agg, A_d, C_d, dimN); agg, A_d, C_d, dimN);
} }
void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) { void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::sum(), hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_row_sum failed"); CHECK_SYNC("hl_matrix_row_sum failed");
} }
...@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::max(), hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_row_max failed"); CHECK_SYNC("hl_matrix_row_max failed");
} }
...@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::min(), hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_row_min failed"); CHECK_SYNC("hl_matrix_row_min failed");
} }
/** /**
* @brief matrix column operator. * @brief matrix column operator.
*/ */
template<class Agg> template <class Agg>
__global__ void KeMatrixColumnOp(Agg agg, __global__ void KeMatrixColumnOp(
real *E, Agg agg, real *E, real *Sum, int dimM, int dimN) {
real *Sum,
int dimM,
int dimN) {
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
real tmp = agg.init(); real tmp = agg.init();
if (rowIdx < dimN) { if (rowIdx < dimN) {
...@@ -127,15 +104,12 @@ __global__ void KeMatrixColumnOp(Agg agg, ...@@ -127,15 +104,12 @@ __global__ void KeMatrixColumnOp(Agg agg,
} }
} }
template<class Agg, int blockDimX, int blockDimY> template <class Agg, int blockDimX, int blockDimY>
__global__ void KeMatrixColumnOp_S(Agg agg, __global__ void KeMatrixColumnOp_S(
real *E, Agg agg, real *E, real *Sum, int dimM, int dimN) {
real *Sum, __shared__ real _sum[blockDimX * blockDimY];
int dimM, int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int dimN) { int index = threadIdx.y;
__shared__ real _sum[blockDimX*blockDimY];
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int index = threadIdx.y;
real tmp = agg.init(); real tmp = agg.init();
if (rowIdx < dimN) { if (rowIdx < dimN) {
...@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg, ...@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
index += blockDimY; index += blockDimY;
} }
} }
_sum[threadIdx.x + threadIdx.y*blockDimX] = tmp; _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
__syncthreads(); __syncthreads();
if (rowIdx < dimN) { if (rowIdx < dimN) {
if (threadIdx.y ==0) { if (threadIdx.y == 0) {
real tmp = agg.init(); real tmp = agg.init();
for (int i=0; i < blockDimY; i++) { for (int i = 0; i < blockDimY; i++) {
tmp = agg(tmp, _sum[threadIdx.x + i*blockDimX]); tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
} }
Sum[rowIdx] = tmp; Sum[rowIdx] = tmp;
} }
...@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg, ...@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
} }
template <class Agg> template <class Agg>
void hl_matrix_column_op(Agg agg, void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
real *A_d,
real *C_d,
int dimM,
int dimN) {
if (dimN >= 8192) { if (dimN >= 8192) {
int blocksX = (dimN + 128 -1) / 128; int blocksX = (dimN + 128 - 1) / 128;
int blocksY = 1; int blocksY = 1;
dim3 threads(128, 1); dim3 threads(128, 1);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixColumnOp<Agg><<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
(agg, A_d, C_d, dimM, dimN); agg, A_d, C_d, dimM, dimN);
} else { } else {
int blocksX = (dimN + 32 -1) / 32; int blocksX = (dimN + 32 - 1) / 32;
int blocksY = 1; int blocksY = 1;
dim3 threads(32, 32); dim3 threads(32, 32);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixColumnOp_S<Agg, 32, 32><<< grid, threads, 0, STREAM_DEFAULT>>> KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
(agg, A_d, C_d, dimM, dimN); agg, A_d, C_d, dimM, dimN);
} }
return; return;
...@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::sum(), hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_column_sum failed"); CHECK_SYNC("hl_matrix_column_sum failed");
} }
...@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::max(), hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_column_max failed"); CHECK_SYNC("hl_matrix_column_max failed");
} }
...@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::min(), hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_column_min failed"); CHECK_SYNC("hl_matrix_column_min failed");
} }
...@@ -226,16 +184,16 @@ template <int blockSize> ...@@ -226,16 +184,16 @@ template <int blockSize>
__global__ void KeVectorSum(real *E, real *Sum, int dimM) { __global__ void KeVectorSum(real *E, real *Sum, int dimM) {
__shared__ double sum_s[blockSize]; __shared__ double sum_s[blockSize];
int tid = threadIdx.x; int tid = threadIdx.x;
int index = blockIdx.y*blockDim.x+threadIdx.x; int index = blockIdx.y * blockDim.x + threadIdx.x;
sum_s[tid] = 0.0f; sum_s[tid] = 0.0f;
while (index < dimM) { while (index < dimM) {
sum_s[tid] += E[index]; sum_s[tid] += E[index];
index += blockDim.x*gridDim.y; index += blockDim.x * gridDim.y;
} }
__syncthreads(); __syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) { for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) { if (tid < stride) {
sum_s[tid] += sum_s[tid + stride]; sum_s[tid] += sum_s[tid + stride];
} }
...@@ -259,38 +217,39 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) { ...@@ -259,38 +217,39 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
dim3 threads(blockSize, 1); dim3 threads(blockSize, 1);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
hl_event_t hl_event = &hl_event_st; hl_event_t hl_event = &hl_event_st;
while (!hl_cuda_event_is_ready(hl_event)) {} while (!hl_cuda_event_is_ready(hl_event)) {
}
KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
(A_d, t_resource.gpu_mem, dimM); A_d, t_resource.gpu_mem, dimM);
KeVectorSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
(t_resource.gpu_mem, t_resource.cpu_mem, 128); t_resource.gpu_mem, t_resource.cpu_mem, 128);
hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
hl_stream_synchronize(HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT);
cudaError_t err = (cudaError_t)hl_get_device_last_error(); cudaError_t err = (cudaError_t)hl_get_device_last_error();
CHECK_EQ(cudaSuccess, err) CHECK_EQ(cudaSuccess, err) << "CUDA error: "
<< "CUDA error: " << hl_get_device_error_string((size_t)err); << hl_get_device_error_string((size_t)err);
} }
template <int blockSize> template <int blockSize>
__global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) { __global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
__shared__ double sum_s[blockSize]; __shared__ double sum_s[blockSize];
int tid = threadIdx.x; int tid = threadIdx.x;
int index = blockIdx.y*blockDim.x+threadIdx.x; int index = blockIdx.y * blockDim.x + threadIdx.x;
sum_s[tid] = 0.0f; sum_s[tid] = 0.0f;
while (index < dimM) { while (index < dimM) {
sum_s[tid] += abs(E[index]); sum_s[tid] += abs(E[index]);
index += blockDim.x*gridDim.y; index += blockDim.x * gridDim.y;
} }
__syncthreads(); __syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) { for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) { if (tid < stride) {
sum_s[tid] += sum_s[tid + stride]; sum_s[tid] += sum_s[tid + stride];
} }
...@@ -314,20 +273,21 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) { ...@@ -314,20 +273,21 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
dim3 threads(blockSize, 1); dim3 threads(blockSize, 1);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
hl_event_t hl_event = &hl_event_st; hl_event_t hl_event = &hl_event_st;
while (!hl_cuda_event_is_ready(hl_event)) {} while (!hl_cuda_event_is_ready(hl_event)) {
}
KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
(A_d, t_resource.gpu_mem, dimM); A_d, t_resource.gpu_mem, dimM);
KeVectorAbsSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
(t_resource.gpu_mem, t_resource.cpu_mem, 128); t_resource.gpu_mem, t_resource.cpu_mem, 128);
hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
hl_stream_synchronize(HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT);
cudaError_t err = (cudaError_t)hl_get_device_last_error(); cudaError_t err = (cudaError_t)hl_get_device_last_error();
CHECK_EQ(cudaSuccess, err) CHECK_EQ(cudaSuccess, err) << "CUDA error: "
<< "CUDA error: " << hl_get_device_error_string((size_t)err); << hl_get_device_error_string((size_t)err);
} }
...@@ -12,21 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,21 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <float.h> #include <float.h>
#include "hl_base.h" #include "hl_base.h"
#include "hl_cnn.h" #include "hl_cnn.h"
#include "hl_device_functions.cuh" #include "hl_device_functions.cuh"
__global__ void KeMaxPoolForward(const int nthreads, const real* inputData, __global__ void KeMaxPoolForward(const int nthreads,
const int channels, const int height, const real* inputData,
const int channels,
const int height,
const int width, const int width,
const int pooledH, const int pooledW, const int pooledH,
const int ksizeW, const int ksizeH, const int pooledW,
const int strideH, const int strideW, const int ksizeW,
const int offsetH, const int offsetW, const int ksizeH,
real* tgtData, const int tgtStride) { const int strideH,
int index = blockIdx.x * blockDim.x + threadIdx.x; const int strideW,
const int offsetH,
const int offsetW,
real* tgtData,
const int tgtStride) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < nthreads) { if (index < nthreads) {
int pw = index % pooledW; int pw = index % pooledW;
int ph = (index / pooledW) % pooledH; int ph = (index / pooledW) % pooledH;
...@@ -46,44 +52,70 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData, ...@@ -46,44 +52,70 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData,
maxval = inputData[h * width + w]; maxval = inputData[h * width + w];
} }
} }
int tgtIndex = index % (pooledW * pooledH * channels) + int tgtIndex =
frameNum * tgtStride; index % (pooledW * pooledH * channels) + frameNum * tgtStride;
tgtData[tgtIndex] = maxval; tgtData[tgtIndex] = maxval;
} }
} }
void hl_maxpool_forward(const int frameCnt, const real* inputData, void hl_maxpool_forward(const int frameCnt,
const real* inputData,
const int channels, const int channels,
const int height, const int width, const int height,
const int pooledH, const int pooledW, const int width,
const int sizeX, const int sizeY, const int pooledH,
const int strideH, const int strideW, const int pooledW,
const int paddingH, const int paddingW, const int sizeX,
real* tgtData, const int tgtStride) { const int sizeY,
const int strideH,
const int strideW,
const int paddingH,
const int paddingW,
real* tgtData,
const int tgtStride) {
int num_kernels = pooledH * pooledW * channels * frameCnt; int num_kernels = pooledH * pooledW * channels * frameCnt;
int blocks = (num_kernels + 1024 - 1) / 1024; int blocks = (num_kernels + 1024 - 1) / 1024;
dim3 threads(1024, 1); dim3 threads(1024, 1);
dim3 grid(blocks, 1); dim3 grid(blocks, 1);
KeMaxPoolForward<<< grid, threads, 0, STREAM_DEFAULT >>> KeMaxPoolForward<<<grid, threads, 0, STREAM_DEFAULT>>>(num_kernels,
(num_kernels, inputData, channels, height, width, inputData,
pooledH, pooledW, sizeX, sizeY, strideH, strideW, channels,
paddingH, paddingW, tgtData, tgtStride); height,
width,
pooledH,
pooledW,
sizeX,
sizeY,
strideH,
strideW,
paddingH,
paddingW,
tgtData,
tgtStride);
CHECK_SYNC("hl_maxpool_forward failed"); CHECK_SYNC("hl_maxpool_forward failed");
} }
__global__ void KeMaxPoolBackward(const int nthreads, const real* inputData, __global__ void KeMaxPoolBackward(const int nthreads,
const real* outData, const real* outGrad, const real* inputData,
const int channels, const int height, const real* outData,
const real* outGrad,
const int channels,
const int height,
const int width, const int width,
const int pooledH, const int pooledW, const int pooledH,
const int sizeX, const int sizeY, const int pooledW,
const int strideH, const int strideW, const int sizeX,
const int padH, const int padW, const int sizeY,
real scaleA, real scaleB, const int strideH,
real* targetGrad, const int outStride) { const int strideW,
int index = blockIdx.x * blockDim.x + threadIdx.x; const int padH,
const int padW,
real scaleA,
real scaleB,
real* targetGrad,
const int outStride) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < nthreads) { if (index < nthreads) {
// find out the local index // find out the local index
// find out the local offset // find out the local offset
...@@ -107,43 +139,69 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData, ...@@ -107,43 +139,69 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData,
} }
} }
} }
targetGrad[index] = targetGrad[index] = scaleB * targetGrad[index] + scaleA * gradient;
scaleB * targetGrad[index] + scaleA * gradient;
} }
} }
void hl_maxpool_backward(const int frameCnt, const real* inputData, void hl_maxpool_backward(const int frameCnt,
const real* outData, const real* outGrad, const real* inputData,
const int channels, const int height, const real* outData,
const int width, const real* outGrad,
const int pooledH, const int pooledW, const int channels,
const int sizeX, const int sizeY, const int height,
const int strideH, const int strideW, const int width,
const int paddingH, const int paddingW, const int pooledH,
real scaleA, real scaleB, const int pooledW,
real* targetGrad, const int outStride) { const int sizeX,
const int sizeY,
const int strideH,
const int strideW,
const int paddingH,
const int paddingW,
real scaleA,
real scaleB,
real* targetGrad,
const int outStride) {
int num_kernels = height * width * channels * frameCnt; int num_kernels = height * width * channels * frameCnt;
int blocks = (num_kernels + 1024 - 1) / 1024; int blocks = (num_kernels + 1024 - 1) / 1024;
KeMaxPoolBackward<<< blocks, 1024, 0, STREAM_DEFAULT >>> KeMaxPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
(num_kernels, inputData, outData, outGrad, channels, inputData,
height, width, pooledH, pooledW, sizeX, sizeY, outData,
strideH, strideW, outGrad,
paddingH, paddingW, channels,
scaleA, scaleB, height,
targetGrad, outStride); width,
pooledH,
pooledW,
sizeX,
sizeY,
strideH,
strideW,
paddingH,
paddingW,
scaleA,
scaleB,
targetGrad,
outStride);
CHECK_SYNC("hl_maxpool_backward"); CHECK_SYNC("hl_maxpool_backward");
} }
__global__ void KeAvgPoolForward(const int nthreads, const real* inputData, __global__ void KeAvgPoolForward(const int nthreads,
const real* inputData,
const int channels, const int channels,
const int height, const int width, const int height,
const int pooledH, const int pooledW, const int width,
const int sizeX, const int sizeY, const int pooledH,
const int strideH, const int strideW, const int pooledW,
const int padH, const int padW, const int sizeX,
real* tgtData, const int tgtStride) { const int sizeY,
const int strideH,
const int strideW,
const int padH,
const int padW,
real* tgtData,
const int tgtStride) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < nthreads) { if (index < nthreads) {
int pw = index % pooledW; int pw = index % pooledW;
...@@ -168,39 +226,64 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData, ...@@ -168,39 +226,64 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData,
aveval += inputData[h * width + w]; aveval += inputData[h * width + w];
} }
} }
int tgtIndex = index % (pooledW * pooledH * channels) + int tgtIndex =
frameNum * tgtStride; index % (pooledW * pooledH * channels) + frameNum * tgtStride;
tgtData[tgtIndex] = aveval / pool_size; tgtData[tgtIndex] = aveval / pool_size;
} }
} }
void hl_avgpool_forward(const int frameCnt, const real* inputData, void hl_avgpool_forward(const int frameCnt,
const real* inputData,
const int channels, const int channels,
const int height, const int width, const int height,
const int pooledH, const int pooledW, const int width,
const int sizeX, const int sizeY, const int pooledH,
const int strideH, const int strideW, const int pooledW,
const int paddingH, const int paddingW, const int sizeX,
real* tgtData, const int tgtStride) { const int sizeY,
const int strideH,
const int strideW,
const int paddingH,
const int paddingW,
real* tgtData,
const int tgtStride) {
int num_kernels = pooledH * pooledW * channels * frameCnt; int num_kernels = pooledH * pooledW * channels * frameCnt;
int blocks = (num_kernels + 1024 - 1) / 1024; int blocks = (num_kernels + 1024 - 1) / 1024;
KeAvgPoolForward<<< blocks, 1024, 0, STREAM_DEFAULT >>> KeAvgPoolForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
(num_kernels, inputData, channels, inputData,
height, width, pooledH, pooledW, channels,
sizeX, sizeY, strideH, strideW, height,
paddingH, paddingW, tgtData, tgtStride); width,
pooledH,
pooledW,
sizeX,
sizeY,
strideH,
strideW,
paddingH,
paddingW,
tgtData,
tgtStride);
CHECK_SYNC("hl_avgpool_forward failed"); CHECK_SYNC("hl_avgpool_forward failed");
} }
__global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, __global__ void KeAvgPoolBackward(const int nthreads,
const int channels, const int height, const real* outGrad,
const int channels,
const int height,
const int width, const int width,
const int pooledH, const int pooledW, const int pooledH,
const int sizeX, const int sizeY, const int pooledW,
const int strideH, const int strideW, const int sizeX,
const int padH, const int padW, const int sizeY,
real scaleA, real scaleB, const int strideH,
real* tgtGrad, const int outStride) { const int strideW,
const int padH,
const int padW,
real scaleA,
real scaleB,
real* tgtGrad,
const int outStride) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < nthreads) { if (index < nthreads) {
int offsetW = index % width + padW; int offsetW = index % width + padW;
...@@ -215,7 +298,6 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, ...@@ -215,7 +298,6 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
real gradient = 0; real gradient = 0;
outGrad += (frameNum * outStride + offsetC * pooledH * pooledW); outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
for (int ph = phstart; ph < phend; ++ph) { for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) { for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size // figure out the pooling size
...@@ -224,32 +306,50 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, ...@@ -224,32 +306,50 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
int hend = min(hstart + sizeY, height + padH); int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW); int wend = min(wstart + sizeX, width + padW);
int poolsize = (hend - hstart) * (wend - wstart); int poolsize = (hend - hstart) * (wend - wstart);
gradient += outGrad[ph * pooledW + pw]/poolsize; gradient += outGrad[ph * pooledW + pw] / poolsize;
} }
} }
tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient; tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient;
} }
} }
void hl_avgpool_backward(const int frameCnt, const real* outGrad, void hl_avgpool_backward(const int frameCnt,
const real* outGrad,
const int channels, const int channels,
const int height, const int width, const int height,
const int pooledH, const int pooledW, const int width,
const int sizeX, const int sizeY, const int pooledH,
const int strideH, const int strideW, const int pooledW,
const int paddingH, const int paddingW, const int sizeX,
real scaleA, real scaleB, const int sizeY,
real* backGrad, const int outStride) { const int strideH,
const int strideW,
const int paddingH,
const int paddingW,
real scaleA,
real scaleB,
real* backGrad,
const int outStride) {
int num_kernels = height * width * channels * frameCnt; int num_kernels = height * width * channels * frameCnt;
int blocks = (num_kernels + 1024 - 1) / 1024; int blocks = (num_kernels + 1024 - 1) / 1024;
KeAvgPoolBackward <<< blocks, 1024, 0, STREAM_DEFAULT >>> KeAvgPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
(num_kernels, outGrad, channels, height, width, outGrad,
pooledH, pooledW, sizeX, sizeY, channels,
strideH, strideW, height,
paddingH, paddingW, width,
scaleA, scaleB, pooledH,
backGrad, outStride); pooledW,
sizeX,
sizeY,
strideH,
strideW,
paddingH,
paddingW,
scaleA,
scaleB,
backGrad,
outStride);
CHECK_SYNC("hl_avgpool_backward failed"); CHECK_SYNC("hl_avgpool_backward failed");
} }
...@@ -266,7 +366,7 @@ __global__ void KeBilinearInterpFw(const real* in, ...@@ -266,7 +366,7 @@ __global__ void KeBilinearInterpFw(const real* in,
const size_t numChannels, const size_t numChannels,
const real ratioH, const real ratioH,
const real ratioW) { const real ratioW) {
int nthreads = outputH * outputW; int nthreads = outputH * outputW;
int tid = blockIdx.x * blockDim.x + threadIdx.x; int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < nthreads) { if (tid < nthreads) {
int outIdH = tid / outputW; int outIdH = tid / outputW;
...@@ -287,13 +387,14 @@ __global__ void KeBilinearInterpFw(const real* in, ...@@ -287,13 +387,14 @@ __global__ void KeBilinearInterpFw(const real* in,
real w1lambda = ratioW * outImgIdx - inImgIdx; real w1lambda = ratioW * outImgIdx - inImgIdx;
real w2lambda = 1.f - w1lambda; real w2lambda = 1.f - w1lambda;
const real* inPos = const real* inPos = &in[outIdH * inputW + channelId * inImgSize +
&in[outIdH * inputW + channelId * inImgSize + inImgIdy * inImgW + inImgIdx]; inImgIdy * inImgW + inImgIdx];
// bilinear interpolation // bilinear interpolation
out[outIdH * outputW + outIdW] = out[outIdH * outputW + outIdW] =
h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) + h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) +
h1lambda * (w2lambda * inPos[hId * inImgW] + w1lambda * inPos[hId * inImgW + wId]); h1lambda * (w2lambda * inPos[hId * inImgW] +
w1lambda * inPos[hId * inImgW + wId]);
} }
} }
...@@ -313,9 +414,19 @@ void hl_bilinear_forward(const real* inData, ...@@ -313,9 +414,19 @@ void hl_bilinear_forward(const real* inData,
int threadNum = outputH * outputW; int threadNum = outputH * outputW;
int blocks = (threadNum + 1024 - 1) / 1024; int blocks = (threadNum + 1024 - 1) / 1024;
KeBilinearInterpFw<<< blocks, 1024, 0, STREAM_DEFAULT>>>( KeBilinearInterpFw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inData,
inData, inImgH, inImgW, inputH, inputW, outData, outImgH, inImgH,
outImgW, outputH, outputW, numChannels, ratioH, ratioW); inImgW,
inputH,
inputW,
outData,
outImgH,
outImgW,
outputH,
outputW,
numChannels,
ratioH,
ratioW);
CHECK_SYNC("hl_bilinear_forward failed"); CHECK_SYNC("hl_bilinear_forward failed");
} }
...@@ -353,13 +464,15 @@ __global__ void KeBilinearInterpBw(real* in, ...@@ -353,13 +464,15 @@ __global__ void KeBilinearInterpBw(real* in,
real w1lambda = ratioW * outImgIdx - inImgIdx; real w1lambda = ratioW * outImgIdx - inImgIdx;
real w2lambda = 1.f - w1lambda; real w2lambda = 1.f - w1lambda;
real* inPos = real* inPos = &in[outIdH * inputW + channelId * inImgSize +
&in[outIdH * inputW + channelId * inImgSize + inImgIdy * inImgW + inImgIdx]; inImgIdy * inImgW + inImgIdx];
const real* outPos = &out[outIdH * outputW + outIdW]; const real* outPos = &out[outIdH * outputW + outIdW];
paddle::paddleAtomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]); paddle::paddleAtomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]);
paddle::paddleAtomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]); paddle::paddleAtomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]);
paddle::paddleAtomicAdd(&inPos[hId * inImgW], h1lambda * w2lambda * outPos[0]); paddle::paddleAtomicAdd(&inPos[hId * inImgW],
paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId], h1lambda * w1lambda * outPos[0]); h1lambda * w2lambda * outPos[0]);
paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId],
h1lambda * w1lambda * outPos[0]);
} }
} }
...@@ -379,22 +492,37 @@ void hl_bilinear_backward(real* inGrad, ...@@ -379,22 +492,37 @@ void hl_bilinear_backward(real* inGrad,
int threadNum = outputH * outputW; int threadNum = outputH * outputW;
int blocks = (threadNum + 1024 - 1) / 1024; int blocks = (threadNum + 1024 - 1) / 1024;
KeBilinearInterpBw<<< blocks, 1024, 0, STREAM_DEFAULT>>>( KeBilinearInterpBw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inGrad,
inGrad, inImgH, inImgW, inputH, inputW, outGrad, outImgH, inImgH,
outImgW, outputH, outputW, numChannels, ratioH, ratioW); inImgW,
inputH,
inputW,
outGrad,
outImgH,
outImgW,
outputH,
outputW,
numChannels,
ratioH,
ratioW);
CHECK_SYNC("hl_bilinear_backward failed"); CHECK_SYNC("hl_bilinear_backward failed");
} }
__global__ void maxoutFpCompute(size_t nthreads, const real * inData, __global__ void maxoutFpCompute(size_t nthreads,
real * outData, int* idData, const real* inData,
size_t size, size_t featLen, size_t groups) { real* outData,
int* idData,
size_t size,
size_t featLen,
size_t groups) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
if(index < nthreads) { if (index < nthreads) {
size_t batch_idx = index / size; size_t batch_idx = index / size;
size_t i = index % size; size_t i = index % size;
size_t channel_idx = i / featLen; size_t channel_idx = i / featLen;
size_t feat_idx = i % featLen; size_t feat_idx = i % featLen;
size_t data_idx = (batch_idx * size + channel_idx * featLen) * groups + feat_idx; size_t data_idx =
(batch_idx * size + channel_idx * featLen) * groups + feat_idx;
real max = inData[data_idx]; real max = inData[data_idx];
int maxId = 0; int maxId = 0;
for (size_t g = 1; g < groups; ++g) { for (size_t g = 1; g < groups; ++g) {
...@@ -409,37 +537,50 @@ __global__ void maxoutFpCompute(size_t nthreads, const real * inData, ...@@ -409,37 +537,50 @@ __global__ void maxoutFpCompute(size_t nthreads, const real * inData,
} }
} }
void hl_maxout_forward(const real* inData, real* outData, void hl_maxout_forward(const real* inData,
int* idData, size_t batchSize, size_t size, real* outData,
size_t featLen, size_t groups) { int* idData,
size_t batchSize,
size_t size,
size_t featLen,
size_t groups) {
int num_kernels = size * batchSize; int num_kernels = size * batchSize;
int blocks = (num_kernels + 1024 - 1) / 1024; int blocks = (num_kernels + 1024 - 1) / 1024;
maxoutFpCompute<<< blocks, 1024, 0, STREAM_DEFAULT>>>( maxoutFpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
num_kernels, inData, outData, idData, size, featLen, groups); num_kernels, inData, outData, idData, size, featLen, groups);
CHECK_SYNC("hl_maxout_forward failed"); CHECK_SYNC("hl_maxout_forward failed");
} }
__global__ void maxoutBpCompute(size_t nthreads, real* inGrad, __global__ void maxoutBpCompute(size_t nthreads,
const real* outGrad, const int* idData, real* inGrad,
size_t size, size_t featLen, size_t groups) { const real* outGrad,
const int* idData,
size_t size,
size_t featLen,
size_t groups) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
if(index < nthreads) { if (index < nthreads) {
size_t batch_idx = index / size; size_t batch_idx = index / size;
size_t i = index % size; size_t i = index % size;
size_t channel_idx = i / featLen; size_t channel_idx = i / featLen;
size_t feat_idx = i % featLen; size_t feat_idx = i % featLen;
size_t newIndex = batch_idx * size; size_t newIndex = batch_idx * size;
size_t gradIdx = (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx; size_t gradIdx =
(channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx;
(inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i]; (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i];
} }
} }
void hl_maxout_backward(real* inGrad, const real* outGrad, void hl_maxout_backward(real* inGrad,
const int* idData, size_t batchSize, size_t size, const real* outGrad,
size_t featLen, size_t groups) { const int* idData,
size_t batchSize,
size_t size,
size_t featLen,
size_t groups) {
int num_kernels = size * batchSize; int num_kernels = size * batchSize;
int blocks = (num_kernels + 1024 - 1) / 1024; int blocks = (num_kernels + 1024 - 1) / 1024;
maxoutBpCompute<<< blocks, 1024, 0, STREAM_DEFAULT >>>( maxoutBpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
num_kernels, inGrad, outGrad, idData, size, featLen, groups); num_kernels, inGrad, outGrad, idData, size, featLen, groups);
CHECK_SYNC("hl_maxout_backward failed"); CHECK_SYNC("hl_maxout_backward failed");
} }
...@@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_activation_functions.h"
#include "hl_base.h" #include "hl_base.h"
#include "hl_cuda_cublas.h" #include "hl_cuda_cublas.h"
#include "hl_device_functions.cuh" #include "hl_device_functions.cuh"
#include "hl_activation_functions.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
typedef hppl::Active<real>::forward t_forward; typedef hppl::Active<real>::forward t_forward;
typedef hppl::Active<real>::backward t_backward; typedef hppl::Active<real>::backward t_backward;
bool hl_lstm_sequence_parallel(int frameSize) { bool hl_lstm_sequence_parallel(int frameSize) {
...@@ -42,9 +41,9 @@ public: ...@@ -42,9 +41,9 @@ public:
value_ += (start + length - 1) * frameSize + idx; value_ += (start + length - 1) * frameSize + idx;
} }
} }
__device__ inline real *getPtr() const {return value_;} __device__ inline real *getPtr() const { return value_; }
__device__ inline real getValue() {return *value_;} __device__ inline real getValue() { return *value_; }
__device__ inline void setValue(real value) {*value_ = value;} __device__ inline void setValue(real value) { *value_ = value; }
template <int reversed, int frameSize> template <int reversed, int frameSize>
__device__ inline void nextFrame() { __device__ inline void nextFrame() {
if (reversed == 0) { if (reversed == 0) {
...@@ -55,28 +54,25 @@ public: ...@@ -55,28 +54,25 @@ public:
} }
}; };
__device__ __forceinline__ __device__ __forceinline__ void ptx_sync(const int id, const int barriers) {
void ptx_sync(const int id, const int barriers) {
asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(barriers) : "memory"); asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(barriers) : "memory");
} }
__device__ __forceinline__ __device__ __forceinline__ void ptx_arrive(const int id, const int barriers) {
void ptx_arrive(const int id, const int barriers) {
asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(barriers) : "memory"); asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(barriers) : "memory");
} }
template <int valueSize, int frameSize>
__device__ __forceinline__ real forward_sequence(real value,
                                                 real *shValue,
                                                 real *state,
                                                 real *preOutput,
                                                 real *output,
                                                 real check,
                                                 int index,
                                                 t_forward activeNode,
                                                 t_forward activeGate,
                                                 t_forward activeState) {
  real out;
  real prevOut;
  real state_r;
...@@ -112,17 +108,20 @@ forward_sequence(real value,
  if (idy == 0) {
    ptx_sync(2, frameSize * 2);
    prevOut = state[idx];
    prevOut = activeState(prevOut);
    preOutput[idx] = prevOut;
    ptx_arrive(3, frameSize * 2);
  }
  return value;
}

#define OUTPUT_BARRIER_ID 10
#define OUTPUT_BARRIER_ID2 11
template <int valueSize,
          int frameSize,
          int reversed,
          int computeThreads,
          int blockSize>
__global__ void KeLstmForward(real *gateValue,
                              real *state,
                              real *output,
...@@ -184,10 +183,16 @@ __global__ void KeLstmForward(real *gateValue,
    }
  }
  value = forward_sequence<valueSize, frameSize>(
      value,
      shValue,
      shState,
      shPrevOutput,
      shOutput,
      check,
      index,
      hppl::gpu::forward[active_node],
      hppl::gpu::forward[active_gate],
      hppl::gpu::forward[active_state]);
  const int idx = index % frameSize;
  const int idy = index / frameSize;
  if (valueSize == 128) {
...@@ -218,7 +223,7 @@ __global__ void KeLstmForward(real *gateValue,
      real B_r[frameSize];
      const int computeIdx = index - valueSize;
      if (i == 0) {
#pragma unroll
        for (int n = 0; n < frameSize; n++) {
          B_r[n] = weight[n * valueSize + computeIdx];
        }
...@@ -230,7 +235,7 @@ __global__ void KeLstmForward(real *gateValue,
      }
      real sum = 0.0f;
      for (int n = 0; n < frameSize; n++) {
        sum += A_r[n] * B_r[n];
      }
      shValue[computeIdx] = sum;
      ptx_arrive(OUTPUT_BARRIER_ID2, blockSize);
...@@ -239,14 +244,14 @@ __global__ void KeLstmForward(real *gateValue,
    if (valueSize == 256) {
      real B_r[frameSize];
      if (i == 0) {
#pragma unroll
        for (int n = 0; n < frameSize; n++) {
          B_r[n] = weight[n * valueSize + index];
        }
      }
      real sum = 0.0f;
      for (int n = 0; n < frameSize; n++) {
        sum += shOutput[n] * B_r[n];
      }
      value += sum;
    }
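    /* The valueSize == 128 and valueSize == 256 paths compute the recurrent
       matrix-vector product for the next time step: each thread caches one
       column of the recurrent weight matrix in registers (B_r) and accumulates
       its dot product with the current output values, so the whole recurrence
       for a sequence stays inside one thread block, coordinated by the
       ptx_arrive / ptx_sync barriers. */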
...@@ -273,50 +278,81 @@ void hl_lstm_parallel_forward(real *gateValue, ...@@ -273,50 +278,81 @@ void hl_lstm_parallel_forward(real *gateValue,
dim3 grid(numSequences, 1); dim3 grid(numSequences, 1);
if (!reversed) { if (!reversed) {
if (frameSize == 32) { if (frameSize == 32) {
KeLstmForward<128, 32, 0, 128, 256> KeLstmForward<128, 32, 0, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
<<<grid, 256, 0, STREAM_DEFAULT>>> gateValue,
(gateValue, stateValue, outputValue, preOutputValue, stateValue,
checkIg, checkFg, checkOg, weight, sequence, outputValue,
active_node, active_gate, active_state); preOutputValue,
checkIg,
checkFg,
checkOg,
weight,
sequence,
active_node,
active_gate,
active_state);
} else if (frameSize == 64) { } else if (frameSize == 64) {
KeLstmForward<256, 64, 0, 256, 256> KeLstmForward<256, 64, 0, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
<<<grid, 256, 0, STREAM_DEFAULT>>> gateValue,
(gateValue, stateValue, outputValue, preOutputValue, stateValue,
checkIg, checkFg, checkOg, weight, sequence, outputValue,
active_node, active_gate, active_state); preOutputValue,
checkIg,
checkFg,
checkOg,
weight,
sequence,
active_node,
active_gate,
active_state);
} }
} else { } else {
if (frameSize == 32) { if (frameSize == 32) {
KeLstmForward<128, 32, 1, 128, 256> KeLstmForward<128, 32, 1, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
<<<grid, 256, 0, STREAM_DEFAULT>>> gateValue,
(gateValue, stateValue, outputValue, preOutputValue, stateValue,
checkIg, checkFg, checkOg, weight, sequence, outputValue,
active_node, active_gate, active_state); preOutputValue,
checkIg,
checkFg,
checkOg,
weight,
sequence,
active_node,
active_gate,
active_state);
} else if (frameSize == 64) { } else if (frameSize == 64) {
KeLstmForward<256, 64, 1, 256, 256> KeLstmForward<256, 64, 1, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
<<<grid, 256, 0, STREAM_DEFAULT>>> gateValue,
(gateValue, stateValue, outputValue, preOutputValue, stateValue,
checkIg, checkFg, checkOg, weight, sequence, outputValue,
active_node, active_gate, active_state); preOutputValue,
checkIg,
checkFg,
checkOg,
weight,
sequence,
active_node,
active_gate,
active_state);
} }
} }
CHECK_SYNC("hl_lstm_parallel_forward failed"); CHECK_SYNC("hl_lstm_parallel_forward failed");
} }
__device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
  int addr = idx % 32;
#pragma unroll
  for (int k = 1; k < 32; k++) {
    // rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32);
    addr = __shfl(addr, (idx + 1) % 32, 32);
    a[k] = __shfl(a[k], addr, 32);
  }

#pragma unroll
  for (int tid = 0; tid < 31; tid++) {
    real tmp = (idx > tid) ? a[0] : a[1];
#pragma unroll
    for (int k = 31; k > 0; k--) {
      a[(k + 1) % 32] = (idx > tid) ? a[k] : a[(k + 1) % 32];
    }
...@@ -324,29 +360,28 @@ void transpose_32x32(real a[], const int idx) {
  }

  addr = (32 - idx) % 32;
#pragma unroll
  for (int k = 0; k < 32; k++) {
    a[k] = __shfl(a[k], addr, 32);
    addr = __shfl(addr, (idx + 31) % 32, 32);
  }
}
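/* transpose_32x32 transposes a 32x32 tile held entirely in registers (one row
   per lane) using warp __shfl exchanges plus per-lane rotations, avoiding a
   round trip through shared memory. */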
template <int valueSize, int frameSize>
__device__ void backward_sequence(real rGateValue,
                                  real rOutputGrad,
                                  real rPreOutputValue,
                                  real &rGateGrad,
                                  real &rStateGrad,
                                  real *shStateGrad,
                                  real *shStateValue,
                                  real *shGateValue,
                                  real rCheck,
                                  real &rGateValuePrev,
                                  int index,
                                  t_backward activeNode,
                                  t_backward activeGate,
                                  t_backward activeState) {
  const int frameIdx = index % frameSize;
  const int frameIdy = index / frameSize;
  if (frameIdy == 3) {
...@@ -363,8 +398,8 @@ backward_sequence(real rGateValue,
    rStateGrad = rGateGrad * rCheck;
    shStateGrad[index] = rStateGrad;
    ptx_sync(3, valueSize);
    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
    rGateGrad = rStateGrad * shGateValue[frameIdx];
    rGateGrad = activeGate(rGateGrad, rGateValue);
  } else if (frameIdy == 2) {
...@@ -373,7 +408,7 @@ backward_sequence(real rGateValue,
    shStateGrad[index] = rStateGrad;
    ptx_sync(3, valueSize);
    rStateGrad += shStateGrad[frameIdx + frameSize];
    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
    rGateValuePrev = rGateValue;
    rGateGrad = rStateGrad * shStateValue[frameIdx];
    rGateGrad = activeGate(rGateGrad, rGateValue);
...@@ -381,43 +416,43 @@ backward_sequence(real rGateValue,
    shGateValue[frameIdx] = rGateValue;
    ptx_sync(3, valueSize);
    rStateGrad = shStateGrad[frameIdx + frameSize];
    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
    rGateGrad = rStateGrad * shGateValue[frameIdx + frameSize];
    rGateGrad = activeNode(rGateGrad, rGateValue);
  }
}
template <int valueSize, int frameSize>
__device__ void load_weight(real rWeight[], real *weight, const int index) {
  if (valueSize == 128) {
    weight += index;
#pragma unroll
    for (int n = 0; n < frameSize; n++) {
      rWeight[n] = weight[n * valueSize];
    }
    transpose_32x32(rWeight, index % 32);
  }
  if (valueSize == 256) {
    int id = (index / 32) % 2;
    weight += index - id * 32 + id * 32 * valueSize;
#pragma unroll
    for (int n = 0; n < 32; n++) {
      rWeight[n] = weight[n * valueSize];
      rWeight[n + 32] = weight[n * valueSize + 32];
    }
    transpose_32x32(rWeight, index % 32);
    transpose_32x32(&rWeight[32], index % 32);
  }
}
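/* load_weight stages one 32-wide (or, for valueSize == 256, two 32-wide)
   column slices of the recurrent weight matrix into per-thread registers and
   then transposes them in-warp, so the backward pass can reuse the same
   register-resident weights across every time step of the sequence. */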
template<int valueSize, int frameSize, int reversed> template <int valueSize, int frameSize, int reversed>
__global__ void KeLstmBackward(real *gateValue, __global__ void KeLstmBackward(real *gateValue,
real *gateGrad, real *gateGrad,
real *stateValue, real *stateValue,
real *stateGrad, /* do not need save */ real *stateGrad, /* do not need save */
real *preOutputValue, real *preOutputValue,
real *preOutputGrad, /* do not need save */ real *preOutputGrad, /* do not need save */
real *checkIg, real *checkIg,
real *checkIgGrad, real *checkIgGrad,
real *checkFg, real *checkFg,
...@@ -484,20 +519,27 @@ __global__ void KeLstmBackward(real *gateValue, ...@@ -484,20 +519,27 @@ __global__ void KeLstmBackward(real *gateValue,
for (int i = 0; i < length; ++i) { for (int i = 0; i < length; ++i) {
if (frameIdy == 3) { if (frameIdy == 3) {
if (i != length -1) { if (i != length - 1) {
frameStateValue.nextFrame<!reversed, frameSize>(); frameStateValue.nextFrame<!reversed, frameSize>();
shStateValue[frameIdx] = frameStateValue.getValue(); shStateValue[frameIdx] = frameStateValue.getValue();
} else { } else {
shStateValue[frameIdx] = 0.0; shStateValue[frameIdx] = 0.0;
} }
} }
backward_sequence<valueSize, frameSize>( backward_sequence<valueSize, frameSize>(rGateValue,
rGateValue, rOutputGrad, rPreOutputValue, rGateGrad, rOutputGrad,
rStateGrad, shStateGrad, shStateValue, shGateValue, rPreOutputValue,
rCheck, rGateValuePrev, index, rGateGrad,
hppl::gpu::backward[active_node], rStateGrad,
hppl::gpu::backward[active_gate], shStateGrad,
hppl::gpu::backward[active_state]); shStateValue,
shGateValue,
rCheck,
rGateValuePrev,
index,
hppl::gpu::backward[active_node],
hppl::gpu::backward[active_gate],
hppl::gpu::backward[active_state]);
if (frameIdy == 3) { if (frameIdy == 3) {
rCheckGrad += rGateGrad * rStateValue; rCheckGrad += rGateGrad * rStateValue;
rStateValue = shStateValue[frameIdx]; rStateValue = shStateValue[frameIdx];
...@@ -523,9 +565,9 @@ __global__ void KeLstmBackward(real *gateValue, ...@@ -523,9 +565,9 @@ __global__ void KeLstmBackward(real *gateValue,
shGateGrad[frameIdy][frameIdx] = rGateGrad; shGateGrad[frameIdy][frameIdx] = rGateGrad;
if (valueSize == 128) { if (valueSize == 128) {
real sum = 0.0f; real sum = 0.0f;
#pragma unroll #pragma unroll
for (int n = 0; n < frameSize; n++) { for (int n = 0; n < frameSize; n++) {
sum += shGateGrad[frameIdy][n]*B_r[n]; sum += shGateGrad[frameIdy][n] * B_r[n];
} }
if (frameIdy == 3) { if (frameIdy == 3) {
rOutputGrad += sum; rOutputGrad += sum;
...@@ -541,7 +583,7 @@ __global__ void KeLstmBackward(real *gateValue, ...@@ -541,7 +583,7 @@ __global__ void KeLstmBackward(real *gateValue,
} }
real sum = 0.0f; real sum = 0.0f;
for (int n = 0; n < frameSize; n++) { for (int n = 0; n < frameSize; n++) {
sum += A_r[n]*B_r[n]; sum += A_r[n] * B_r[n];
} }
if (frameIdy == 3) { if (frameIdy == 3) {
rOutputGrad += sum; rOutputGrad += sum;
...@@ -552,8 +594,8 @@ __global__ void KeLstmBackward(real *gateValue, ...@@ -552,8 +594,8 @@ __global__ void KeLstmBackward(real *gateValue,
if (frameIdy == 3) { if (frameIdy == 3) {
ptx_sync(6, valueSize); ptx_sync(6, valueSize);
#pragma unroll #pragma unroll
for (int i = 0; i < 3; i ++) { for (int i = 0; i < 3; i++) {
rOutputGrad += shOutputGrad[i][frameIdx]; rOutputGrad += shOutputGrad[i][frameIdx];
} }
} else { } else {
...@@ -564,11 +606,14 @@ __global__ void KeLstmBackward(real *gateValue, ...@@ -564,11 +606,14 @@ __global__ void KeLstmBackward(real *gateValue,
/* TODO: Temporary save & merger in another kernel */ /* TODO: Temporary save & merger in another kernel */
if (frameIdy == 1) { if (frameIdy == 1) {
if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad); if (checkIgGrad)
paddle::paddleAtomicAdd(checkIgGrad + frameIdx, rCheckGrad);
} else if (frameIdy == 2) { } else if (frameIdy == 2) {
if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad); if (checkFgGrad)
paddle::paddleAtomicAdd(checkFgGrad + frameIdx, rCheckGrad);
} else if (frameIdy == 3) { } else if (frameIdy == 3) {
if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad); if (checkOgGrad)
paddle::paddleAtomicAdd(checkOgGrad + frameIdx, rCheckGrad);
} }
} }
...@@ -593,68 +638,183 @@ void hl_lstm_parallel_backward_data(real *gateValue, ...@@ -593,68 +638,183 @@ void hl_lstm_parallel_backward_data(real *gateValue,
hl_activation_mode_t active_node, hl_activation_mode_t active_node,
hl_activation_mode_t active_gate, hl_activation_mode_t active_gate,
hl_activation_mode_t active_state) { hl_activation_mode_t active_state) {
CHECK(frameSize == 32 || frameSize == 64 || CHECK(frameSize == 32 || frameSize == 64 || frameSize == 128 ||
frameSize == 128 || frameSize == 256); frameSize == 256);
dim3 grid(numSequences, 1); dim3 grid(numSequences, 1);
if (!reversed) { if (!reversed) {
if (frameSize == 32) { if (frameSize == 32) {
KeLstmBackward<128, 32, 0><<<grid, 128, 0, STREAM_DEFAULT>>> KeLstmBackward<128, 32, 0><<<grid, 128, 0, STREAM_DEFAULT>>>(
(gateValue, gateGrad, stateValue, stateGrad, preOutputValue, gateValue,
preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, gateGrad,
checkOgGrad, outputGrad, weight, sequence, stateValue,
active_node, active_gate, active_state); stateGrad,
preOutputValue,
preOutputGrad,
checkIg,
checkIgGrad,
checkFg,
checkFgGrad,
checkOg,
checkOgGrad,
outputGrad,
weight,
sequence,
active_node,
active_gate,
active_state);
} else if (frameSize == 64) { } else if (frameSize == 64) {
KeLstmBackward<256, 64, 0><<<grid, 256, 0, STREAM_DEFAULT>>> KeLstmBackward<256, 64, 0><<<grid, 256, 0, STREAM_DEFAULT>>>(
(gateValue, gateGrad, stateValue, stateGrad, preOutputValue, gateValue,
preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, gateGrad,
checkOgGrad, outputGrad, weight, sequence, stateValue,
active_node, active_gate, active_state); stateGrad,
preOutputValue,
preOutputGrad,
checkIg,
checkIgGrad,
checkFg,
checkFgGrad,
checkOg,
checkOgGrad,
outputGrad,
weight,
sequence,
active_node,
active_gate,
active_state);
} else if (frameSize == 128) { } else if (frameSize == 128) {
KeLstmBackward<512, 128, 0><<<grid, 512, 0, STREAM_DEFAULT>>> KeLstmBackward<512, 128, 0><<<grid, 512, 0, STREAM_DEFAULT>>>(
(gateValue, gateGrad, stateValue, stateGrad, preOutputValue, gateValue,
preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, gateGrad,
checkOgGrad, outputGrad, weight, sequence, stateValue,
active_node, active_gate, active_state); stateGrad,
preOutputValue,
preOutputGrad,
checkIg,
checkIgGrad,
checkFg,
checkFgGrad,
checkOg,
checkOgGrad,
outputGrad,
weight,
sequence,
active_node,
active_gate,
active_state);
} else if (frameSize == 256) { } else if (frameSize == 256) {
KeLstmBackward<1024, 256, 0><<<grid, 1024, 0, STREAM_DEFAULT>>> KeLstmBackward<1024, 256, 0><<<grid, 1024, 0, STREAM_DEFAULT>>>(
(gateValue, gateGrad, stateValue, stateGrad, preOutputValue, gateValue,
preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, gateGrad,
checkOgGrad, outputGrad, weight, sequence, stateValue,
active_node, active_gate, active_state); stateGrad,
preOutputValue,
preOutputGrad,
checkIg,
checkIgGrad,
checkFg,
checkFgGrad,
checkOg,
checkOgGrad,
outputGrad,
weight,
sequence,
active_node,
active_gate,
active_state);
} }
} else { } else {
if (frameSize == 32) { if (frameSize == 32) {
KeLstmBackward<128, 32, 1><<<grid, 128, 0, STREAM_DEFAULT>>> KeLstmBackward<128, 32, 1><<<grid, 128, 0, STREAM_DEFAULT>>>(
(gateValue, gateGrad, stateValue, stateGrad, preOutputValue, gateValue,
preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, gateGrad,
checkOgGrad, outputGrad, weight, sequence, stateValue,
active_node, active_gate, active_state); stateGrad,
preOutputValue,
preOutputGrad,
checkIg,
checkIgGrad,
checkFg,
checkFgGrad,
checkOg,
checkOgGrad,
outputGrad,
weight,
sequence,
active_node,
active_gate,
active_state);
} else if (frameSize == 64) { } else if (frameSize == 64) {
KeLstmBackward<256, 64, 1><<<grid, 256, 0, STREAM_DEFAULT>>> KeLstmBackward<256, 64, 1><<<grid, 256, 0, STREAM_DEFAULT>>>(
(gateValue, gateGrad, stateValue, stateGrad, preOutputValue, gateValue,
preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, gateGrad,
checkOgGrad, outputGrad, weight, sequence, stateValue,
active_node, active_gate, active_state); stateGrad,
preOutputValue,
preOutputGrad,
checkIg,
checkIgGrad,
checkFg,
checkFgGrad,
checkOg,
checkOgGrad,
outputGrad,
weight,
sequence,
active_node,
active_gate,
active_state);
} else if (frameSize == 128) { } else if (frameSize == 128) {
KeLstmBackward<512, 128, 1><<<grid, 512, 0, STREAM_DEFAULT>>> KeLstmBackward<512, 128, 1><<<grid, 512, 0, STREAM_DEFAULT>>>(
(gateValue, gateGrad, stateValue, stateGrad, preOutputValue, gateValue,
preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, gateGrad,
checkOgGrad, outputGrad, weight, sequence, stateValue,
active_node, active_gate, active_state); stateGrad,
preOutputValue,
preOutputGrad,
checkIg,
checkIgGrad,
checkFg,
checkFgGrad,
checkOg,
checkOgGrad,
outputGrad,
weight,
sequence,
active_node,
active_gate,
active_state);
} else if (frameSize == 256) { } else if (frameSize == 256) {
KeLstmBackward<1024, 256, 1><<<grid, 1024, 0, STREAM_DEFAULT>>> KeLstmBackward<1024, 256, 1><<<grid, 1024, 0, STREAM_DEFAULT>>>(
(gateValue, gateGrad, stateValue, stateGrad, preOutputValue, gateValue,
preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, gateGrad,
checkOgGrad, outputGrad, weight, sequence, stateValue,
active_node, active_gate, active_state); stateGrad,
preOutputValue,
preOutputGrad,
checkIg,
checkIgGrad,
checkFg,
checkFgGrad,
checkOg,
checkOgGrad,
outputGrad,
weight,
sequence,
active_node,
active_gate,
active_state);
} }
} }
CHECK_SYNC("hl_lstm_parallel_backward_data"); CHECK_SYNC("hl_lstm_parallel_backward_data");
} }
template<int B_X, int B_Y> template <int B_X, int B_Y>
__global__ void KeSetGradZero(real *gateGrad, __global__ void KeSetGradZero(real *gateGrad,
const int *starts, int valueSize, int numSequences, bool reversed) { const int *starts,
int valueSize,
int numSequences,
bool reversed) {
// const int tid = threadIdx.x; // const int tid = threadIdx.x;
const int frameIdx = blockIdx.x * B_X + threadIdx.x; const int frameIdx = blockIdx.x * B_X + threadIdx.x;
...@@ -682,19 +842,31 @@ void hl_lstm_parallel_backward_weight(real *weightGrad, ...@@ -682,19 +842,31 @@ void hl_lstm_parallel_backward_weight(real *weightGrad,
int valueSize = 4 * frameSize; int valueSize = 4 * frameSize;
dim3 threads(32, 32); dim3 threads(32, 32);
dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32); dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32);
KeSetGradZero<32, 32><<<grid, threads, 0, STREAM_DEFAULT>>> KeSetGradZero<32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
(gateGrad, sequence, valueSize, numSequences, reversed); gateGrad, sequence, valueSize, numSequences, reversed);
if (!reversed) { if (!reversed) {
hl_matrix_mul(outputValue, hl_matrix_mul(outputValue,
HPPL_OP_T, gateGrad + valueSize, HPPL_OP_N, weightGrad, HPPL_OP_T,
frameSize, valueSize, batchSize - 1, gateGrad + valueSize,
1.0, 1.0); HPPL_OP_N,
weightGrad,
frameSize,
valueSize,
batchSize - 1,
1.0,
1.0);
} else { } else {
hl_matrix_mul(outputValue + frameSize, hl_matrix_mul(outputValue + frameSize,
HPPL_OP_T, gateGrad, HPPL_OP_N, weightGrad, HPPL_OP_T,
frameSize, valueSize, batchSize - 1, gateGrad,
1.0, 1.0); HPPL_OP_N,
weightGrad,
frameSize,
valueSize,
batchSize - 1,
1.0,
1.0);
} }
CHECK_SYNC("hl_lstm_parallel_backward_weight"); CHECK_SYNC("hl_lstm_parallel_backward_weight");
} }
...@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
#include "hl_matrix.h"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_ops.cuh"
#include "hl_sequence.h"
#include "hl_sparse.ph"
#include "paddle/utils/Logging.h"

DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);

void hl_matrix_add(real* A_d,
                   real* B_d,
                   real* C_d,
                   int dimM,
                   int dimN,
                   real alpha,
...@@ -36,33 +35,32 @@ void hl_matrix_add(real *A_d,
  CHECK_NOTNULL(B_d);
  CHECK_NOTNULL(C_d);

  hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
      ternary::_add<real>(alpha, beta),
      A_d,
      B_d,
      C_d,
      dimM,
      dimN,
      dimN,
      dimN,
      dimN);
  CHECK_SYNC("hl_matrix_add failed");
}
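/* hl_matrix_add computes C = alpha * A + beta * B element-wise by
   instantiating the ternary-op template with the _add functor; the trailing
   dimN arguments appear to be the row strides of A, B and C for the dense
   case. A hedged call sketch:
     hl_matrix_add(A_d, B_d, C_d, dimM, dimN, 1.0, 1.0);  // C = A + B */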
#ifdef PADDLE_TYPE_DOUBLE
#define THRESHOLD 128
#else
#define THRESHOLD 64
#endif
__device__ __forceinline__ void findMax(real* I,
                                        real* dfMax_s,
                                        int blockSize,
                                        int base,
                                        int curIdx,
                                        int nextIdx,
                                        int dimN,
                                        real* max) {
  dfMax_s[base] = -1.0e20;
  while (curIdx < dimN) {
    if (dfMax_s[base] < I[nextIdx]) {
...@@ -78,25 +76,24 @@ void findMax(real* I,
    if (base < stride) {
      nextIdx = base + stride;
      if (dfMax_s[base] < dfMax_s[nextIdx]) {
        dfMax_s[base] = dfMax_s[nextIdx];
      }
    }
  }
  if (0 == base) {
    max[0] = dfMax_s[0];
  }
  __syncthreads();
}
__device__ __forceinline__ void subMaxAndExp(real* I,
                                             real* O,
                                             int curIdx,
                                             int nextIdx,
                                             int blockSize,
                                             int dimN,
                                             real max) {
  real val;
  while (curIdx < dimN) {
    val = I[nextIdx] - max;
...@@ -115,14 +112,13 @@ void subMaxAndExp(real* I,
  __syncthreads();
}

__device__ __forceinline__ void valueSum(real* O,
                                         real* dfMax_s,
                                         int blockSize,
                                         int base,
                                         int curIdx,
                                         int nextIdx,
                                         int dimN) {
  dfMax_s[base] = 0;
  while (curIdx < dimN) {
    dfMax_s[base] += O[nextIdx];
...@@ -141,13 +137,8 @@ void valueSum(real* O,
  __syncthreads();
}

__device__ __forceinline__ void divSum(
    real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
  while (curIdx < dimN) {
    O[nextIdx] /= sum;
    nextIdx += blockSize;
...@@ -155,20 +146,18 @@ void divSum(real* O,
  }
}
__device__ __forceinline__ void softmax(real* I,
                                        real* O,
                                        real* dfMax_s,
                                        int blockSize,
                                        int base,
                                        int curIdx,
                                        int nextIdx,
                                        int dimN) {
  __shared__ real max;

  // find the max number
  findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);

  // sub max Value and do Exp operation
  subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
...@@ -181,8 +170,8 @@ void softmax(real* I,
  divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
}

template <int blockSize>
__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
  int base = threadIdx.x;
  __shared__ real dfMax_s[blockSize];
  int nextIdx = blockIdx.x * dimN + base;
...@@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
}

void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);

  dim3 block(512, 1);
  dim3 grid(dimM, 1);
  KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
  CHECK_SYNC("hl_matrix_softmax failed");
}
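/* The softmax helper applies the usual numerically stable recipe per row:
   max-reduce, subtract the max and exponentiate, sum-reduce, then divide.
   One 512-thread block handles one row, so the launcher uses grid = dimM.
   A hedged call sketch: hl_matrix_softmax(A_d, C_d, dimM, dimN); */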
template <int blockSize>
__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
  int base = threadIdx.x;
  int bid = blockIdx.x;
  __shared__ real dfMax_s[blockSize];
...@@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
}

void hl_sequence_softmax_forward(real* A_d,
                                 real* C_d,
                                 const int* index,
                                 int numSequence) {
  CHECK_NOTNULL(A_d);
...@@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d,
  dim3 block(512, 1);
  dim3 grid(numSequence, 1);
  KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
  CHECK_SYNC("hl_sequence_softmax_forward failed");
}
__global__ void KeMatrixDerivative(
    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
  int index;

  if (rowIdx < dimM && colIdx < dimN) {
    index = rowIdx * dimN + colIdx;
    grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
  }
}

void hl_matrix_softmax_derivative(
    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
  CHECK_NOTNULL(grad_d);
  CHECK_NOTNULL(output_d);
  CHECK_NOTNULL(sftmaxSum_d);

  int blocksX = (dimM + 0) / 1;
  int blocksY = (dimN + 1024 - 1) / 1024;
  dim3 threads(1, 1024);
  dim3 grid(blocksX, blocksY);

  KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_d, output_d, sftmaxSum_d, dimM, dimN);
  CHECK_SYNC("hl_matrix_softmax_derivative failed");
}
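/* This is the standard softmax backward: with y = softmax(x) and upstream
   gradient g, dL/dx_i = y_i * (g_i - sum_j g_j * y_j). sftmaxSum_d is expected
   to hold the per-row dot product sum_j g_j * y_j computed beforehand. */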
__global__ void KeMatrixMultiBinaryCrossEntropy(real* output, __global__ void KeMatrixMultiBinaryCrossEntropy(
real* entropy, real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
int* row,
int* col,
int dimM,
int dimN) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < dimM) { if (index < dimM) {
for (int i = 0; i < dimN; i ++) { for (int i = 0; i < dimN; i++) {
entropy[index] -= log(1 - output[index * dimN + i]); entropy[index] -= log(1 - output[index * dimN + i]);
} }
int *row_col = col + row[index]; int* row_col = col + row[index];
int col_num = row[index + 1] - row[index]; int col_num = row[index + 1] - row[index];
for (int i = 0; i < col_num; i ++) { for (int i = 0; i < col_num; i++) {
real o = output[index * dimN + row_col[i]]; real o = output[index * dimN + row_col[i]];
entropy[index] -= log(o / (1 - o)); entropy[index] -= log(o / (1 - o));
} }
...@@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output, ...@@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output,
dim3 threads(n_threads); dim3 threads(n_threads);
dim3 grid(blocks); dim3 grid(blocks);
hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
KeMatrixMultiBinaryCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
(output, entropy, mat->csr_row, mat->csr_col, dimM, dimN); output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed"); CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
} }
__global__ void KeMatrixMultiBinaryCrossEntropyBp(real* output, __global__ void KeMatrixMultiBinaryCrossEntropyBp(
real* grad, real* output, real* grad, int* row, int* col, int dimM, int dimN) {
int* row,
int* col,
int dimM,
int dimN) {
int row_idx = blockIdx.x * blockDim.x + threadIdx.x; int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (row_idx < dimM) { if (row_idx < dimM) {
for (int i = 0; i < dimN; i ++) { for (int i = 0; i < dimN; i++) {
int index = row_idx * dimN + i; int index = row_idx * dimN + i;
grad[index] += 1.0 / (1 - output[index]); grad[index] += 1.0 / (1 - output[index]);
} }
int col_num = row[row_idx + 1] - row[row_idx]; int col_num = row[row_idx + 1] - row[row_idx];
int *row_col = col + row[row_idx]; int* row_col = col + row[row_idx];
for (int i = 0; i < col_num; i ++) { for (int i = 0; i < col_num; i++) {
int index = row_idx * dimN + row_col[i]; int index = row_idx * dimN + row_col[i];
grad[index] -= 1.0 / (output[index] * (1 - output[index])); grad[index] -= 1.0 / (output[index] * (1 - output[index]));
} }
} }
} }
void hl_matrix_multi_binary_cross_entropy_bp(real* output, void hl_matrix_multi_binary_cross_entropy_bp(
real* grad, real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
hl_sparse_matrix_s csr_mat,
int dimM,
int dimN) {
CHECK_NOTNULL(output); CHECK_NOTNULL(output);
CHECK_NOTNULL(grad); CHECK_NOTNULL(grad);
CHECK_NOTNULL(csr_mat); CHECK_NOTNULL(csr_mat);
...@@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output, ...@@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output,
dim3 threads(n_threads); dim3 threads(n_threads);
dim3 grid(blocks); dim3 grid(blocks);
hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
KeMatrixMultiBinaryCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
(output, grad, mat->csr_row, mat->csr_col, dimM, dimN); output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed"); CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
} }
__global__ void KeMatrixCrossEntropy(
    real* O, real* E, int* label, int dimM, int dimN) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int newBase;
  if (index < dimM) {
...@@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O,
  }
}

void hl_matrix_cross_entropy(
    real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);

  int blocks = (dimM + 1024 - 1) / 1024;
  dim3 threads(1024, 1);
  dim3 grid(blocks, 1);
  KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
      A_d, C_d, label_d, dimM, dimN);
  CHECK_SYNC("hl_matrix_cross_entropy failed");
}
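/* One thread per row: for row i with integer class label l_i, the body elided
   by this hunk presumably computes the negative log-likelihood
   E[i] = -log(O[i * dimN + l_i]), which matches the backward kernel below
   subtracting 1 / O at the labelled column. */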
__global__ void KeMatrixCrossEntropyBp(real* grad_d, __global__ void KeMatrixCrossEntropyBp(
real* output_d, real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
int* label_d, int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int dimM, int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
int dimN) {
int rowIdx = blockIdx.x*blockDim.x + threadIdx.x;
int colIdx = blockIdx.y*blockDim.y + threadIdx.y;
int index; int index;
if (rowIdx < dimM && colIdx < dimN) { if (rowIdx < dimM && colIdx < dimN) {
index = rowIdx*dimN + colIdx; index = rowIdx * dimN + colIdx;
if (label_d[rowIdx] == colIdx) { if (label_d[rowIdx] == colIdx) {
grad_d[index] -= 1.0f / output_d[index]; grad_d[index] -= 1.0f / output_d[index];
} }
} }
} }
void hl_matrix_cross_entropy_bp(real* grad_d, void hl_matrix_cross_entropy_bp(
real* output_d, real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
int* label_d,
int dimM,
int dimN) {
CHECK_NOTNULL(grad_d); CHECK_NOTNULL(grad_d);
CHECK_NOTNULL(output_d); CHECK_NOTNULL(output_d);
CHECK_NOTNULL(label_d); CHECK_NOTNULL(label_d);
int blocksX = (dimM + 0)/1; int blocksX = (dimM + 0) / 1;
int blocksY = (dimN + 1024 -1) / 1024; int blocksY = (dimN + 1024 - 1) / 1024;
dim3 threads(1, 1024); dim3 threads(1, 1024);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
(grad_d, output_d, label_d, dimM, dimN); grad_d, output_d, label_d, dimM, dimN);
CHECK_SYNC("hl_matrix_cross_entropy_bp failed"); CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
} }
void hl_matrix_zero_mem(real* data, int num) { void hl_matrix_zero_mem(real* data, int num) {
hl_gpu_apply_unary_op( hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
unary::Zero<real>(), data, 1, num, num);
} }
__global__ void KeParamReluForward(real* output, __global__ void KeParamReluForward(real* output,
...@@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output, ...@@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output,
int ty = blockIdx.y * blockDim.y + threadIdx.y; int ty = blockIdx.y * blockDim.y + threadIdx.y;
if (tx < width && ty < height) { if (tx < width && ty < height) {
int index = ty * width + tx; int index = ty * width + tx;
output[index] = input[index] > 0 ? input[index] : output[index] =
input[index] * w[tx / partial_sum]; input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
} }
} }
...@@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output, ...@@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output,
CHECK_NOTNULL(w); CHECK_NOTNULL(w);
dim3 threads(16, 16); dim3 threads(16, 16);
int blockX = (width + 16 - 1) / 16; int blockX = (width + 16 - 1) / 16;
int blockY = (height + 16 -1) / 16; int blockY = (height + 16 - 1) / 16;
dim3 grid(blockX, blockY); dim3 grid(blockX, blockY);
KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>> KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
(output, input, w, width, height, partial_sum); output, input, w, width, height, partial_sum);
CHECK_SYNC("hl_param_relu_forward failed"); CHECK_SYNC("hl_param_relu_forward failed");
} }
template<int blockSize> template <int blockSize>
__global__ void KeParamReluBackWardW(real* grad_w, __global__ void KeParamReluBackWardW(real* grad_w,
real* grad_o, real* grad_o,
real* input, real* input,
...@@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w, ...@@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w,
int grid_num = width / partial_sum; int grid_num = width / partial_sum;
dim3 threads(blockSize, 1); dim3 threads(blockSize, 1);
dim3 grid(grid_num, 1); dim3 grid(grid_num, 1);
KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>> KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
(grad_w, grad_o, input, width, height, partial_sum); grad_w, grad_o, input, width, height, partial_sum);
CHECK_SYNC("hl_param_relu_backward_w failed"); CHECK_SYNC("hl_param_relu_backward_w failed");
} }
...@@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o, ...@@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o,
CHECK_NOTNULL(diff); CHECK_NOTNULL(diff);
dim3 threads(16, 16); dim3 threads(16, 16);
int blockX = (width + 16 - 1) / 16; int blockX = (width + 16 - 1) / 16;
int blockY = (height + 16 -1) / 16; int blockY = (height + 16 - 1) / 16;
dim3 grid(blockX, blockY); dim3 grid(blockX, blockY);
KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>> KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
(grad_o, data, w, diff, width, height, partial_sum); grad_o, data, w, diff, width, height, partial_sum);
CHECK_SYNC("hl_param_relu_backward_diff failed"); CHECK_SYNC("hl_param_relu_backward_diff failed");
} }
__global__ void KeMatrixAddSharedBias(real* A, __global__ void KeMatrixAddSharedBias(
real* B, real* A, real* B, const int channel, const int M, const int N, real scale) {
const int channel,
const int M,
const int N,
real scale) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
int dim = N / channel; int dim = N / channel;
if (index < M * N) { if (index < M * N) {
...@@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d, ...@@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d,
real scale) { real scale) {
const int blocks = 512; const int blocks = 512;
const int grids = DIVUP(dimM * dimN, blocks); const int grids = DIVUP(dimM * dimN, blocks);
KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>> KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
(A_d, B_d, channel, dimM, dimN, scale); A_d, B_d, channel, dimM, dimN, scale);
CHECK_SYNC("hl_matrix_add_shared_bias failed"); CHECK_SYNC("hl_matrix_add_shared_bias failed");
} }
template <int blockSize> template <int blockSize>
__global__ void KeMatrixCollectSharedBias(real *B, __global__ void KeMatrixCollectSharedBias(real* B,
real *A, real* A,
const int channel, const int channel,
const int M, const int M,
const int N, const int N,
...@@ -589,7 +541,7 @@ __global__ void KeMatrixCollectSharedBias(real *B, ...@@ -589,7 +541,7 @@ __global__ void KeMatrixCollectSharedBias(real *B,
int n = j * blockSize + tid; int n = j * blockSize + tid;
int m = n / dim; int m = n / dim;
int w = n % dim; int w = n % dim;
smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0; smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
__syncthreads(); __syncthreads();
simpleReduce(smem, tid, blockSize); simpleReduce(smem, tid, blockSize);
sum += smem[0]; sum += smem[0];
...@@ -611,33 +563,32 @@ void hl_matrix_collect_shared_bias(real* B_d, ...@@ -611,33 +563,32 @@ void hl_matrix_collect_shared_bias(real* B_d,
const int limit = 64; const int limit = 64;
int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel; int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;
KeMatrixCollectSharedBias<blocks> KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
<<< grids, blocks, 0, STREAM_DEFAULT>>> B_d, A_d, channel, dimM, dimN, dim, limit, scale);
(B_d, A_d, channel, dimM, dimN, dim, limit, scale);
CHECK_SYNC("hl_matrix_collect_shared_bias failed"); CHECK_SYNC("hl_matrix_collect_shared_bias failed");
} }
__global__ void keMatrixRotate(
    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < dimM * dimN) {
    int i = idx / dimN;
    int j = idx % dimN;
    if (clockWise) {
      matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
    } else {
      matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
    }
  }
}

void hl_matrix_rotate(
    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
  CHECK_NOTNULL(mat);
  CHECK_NOTNULL(matRot);
  const int threads = 512;
  const int blocks = DIVUP(dimM * dimN, threads);
  keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
      mat, matRot, dimM, dimN, clockWise);
  CHECK_SYNC("hl_matrix_rotate failed");
}
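/* hl_matrix_rotate performs a 90-degree rotation of a dimM x dimN matrix into
   a dimN x dimM output: clockwise maps mat[i][j] -> matRot[j][dimM - 1 - i],
   counter-clockwise maps mat[i][j] -> matRot[dimN - 1 - j][i]. One thread per
   element, 512 threads per block. */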
...@@ -16,36 +16,36 @@ limitations under the License. */

#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"

__global__ void KeMaxSequenceForward(real* input,
                                     const int* sequence,
                                     real* output,
                                     int* index,
                                     int numSequences,
                                     int dim) {
  int dimIdx = threadIdx.x;
  int sequenceId = blockIdx.x;
  if (sequenceId >= numSequences) return;
  int start = sequence[sequenceId];
  int end = sequence[sequenceId + 1];

  for (int i = dimIdx; i < dim; i += blockDim.x) {
    real tmp = -HL_FLOAT_MAX;
    int tmpId = -1;
    for (int insId = start; insId < end; insId++) {
      if (tmp < input[insId * dim + i]) {
        tmp = input[insId * dim + i];
        tmpId = insId;
      }
    }
    output[sequenceId * dim + i] = tmp;
    index[sequenceId * dim + i] = tmpId;
  }
}

void hl_max_sequence_forward(real* input,
                             const int* sequence,
                             real* output,
                             int* index,
                             int numSequences,
                             int dim) {
  CHECK_NOTNULL(input);
...@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input,
  dim3 threads(256, 1);
  dim3 grid(numSequences, 1);
  KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
      input, sequence, output, index, numSequences, dim);
  CHECK_SYNC("hl_max_sequence_forward failed");
}
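/* Max pooling over time: one block per sequence, with threads striding over
   the dim feature columns. For each column the kernel scans the sequence's
   rows [start, end), writes the maximum to output and the winning row id to
   index, so the backward pass can route gradients without re-scanning. */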
__global__ void KeMaxSequenceBackward(
    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  int colIdx = idx % dim;
  if (idx < numSequences * dim) {
    int insId = index[idx];
    inputGrad[insId * dim + colIdx] += outputGrad[idx];
  }
}

void hl_max_sequence_backward(
    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
  CHECK_NOTNULL(outputGrad);
  CHECK_NOTNULL(index);
  CHECK_NOTNULL(inputGrad);
...@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad,
  unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
  dim3 threads(128, 1);
  dim3 grid(blocks, 1);
  KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
      outputGrad, index, inputGrad, numSequences, dim);
  CHECK_SYNC("hl_max_sequence_backward failed");
}
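/* Backward of max-over-time: each output gradient element is added to the
   input row recorded in index during the forward pass; elements that never
   won the max receive no gradient. Launch is one thread per output element. */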
template<int blockDimX, int blockDimY, int gridDimX, bool AddRow> template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output, __global__ void KeMatrixAddRows(real* output,
real* table, real* table,
int* ids, int* ids,
...@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output, ...@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output,
while (sampleId < numSamples) { while (sampleId < numSamples) {
int tableId = ids[sampleId]; int tableId = ids[sampleId];
if ((0 <= tableId) && (tableId < tableSize)) { if ((0 <= tableId) && (tableId < tableSize)) {
real *outputData = output + sampleId * dim; real* outputData = output + sampleId * dim;
real *tableData = table + tableId * dim; real* tableData = table + tableId * dim;
for (int i = idx; i < dim; i += blockDimX) { for (int i = idx; i < dim; i += blockDimX) {
if (AddRow == 0) { if (AddRow == 0) {
outputData[i] += tableData[i]; outputData[i] += tableData[i];
...@@ -114,24 +108,27 @@ __global__ void KeMatrixAddRows(real* output, ...@@ -114,24 +108,27 @@ __global__ void KeMatrixAddRows(real* output,
} }
} }
} }
sampleId += blockDimY*gridDimX; sampleId += blockDimY * gridDimX;
} }
} }
template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd> template <int blockDimX,
__global__ int blockDimY,
void KeSequence2Batch(real *batch, int gridDimX,
real *sequence, bool seq2batch,
const int *batchIndex, bool isAdd>
int seqWidth, __global__ void KeSequence2Batch(real* batch,
int batchCount) { real* sequence,
const int* batchIndex,
int seqWidth,
int batchCount) {
int idx = threadIdx.x; int idx = threadIdx.x;
int idy = threadIdx.y; int idy = threadIdx.y;
int id = blockIdx.x + idy * gridDimX; int id = blockIdx.x + idy * gridDimX;
while (id < batchCount) { while (id < batchCount) {
int seqId = batchIndex[id]; int seqId = batchIndex[id];
real* batchData = batch + id*seqWidth; real* batchData = batch + id * seqWidth;
real* seqData = sequence + seqId*seqWidth; real* seqData = sequence + seqId * seqWidth;
for (int i = idx; i < seqWidth; i += blockDimX) { for (int i = idx; i < seqWidth; i += blockDimX) {
if (seq2batch) { if (seq2batch) {
if (isAdd) { if (isAdd) {
...@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch, ...@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch,
} }
} }
} }
id += blockDimY*gridDimX; id += blockDimY * gridDimX;
} }
} }
void hl_sequence2batch_copy(real *batch, void hl_sequence2batch_copy(real* batch,
real *sequence, real* sequence,
const int *batchIndex, const int* batchIndex,
int seqWidth, int seqWidth,
int batchCount, int batchCount,
bool seq2batch) { bool seq2batch) {
...@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch, ...@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch,
dim3 threads(128, 8); dim3 threads(128, 8);
dim3 grid(8, 1); dim3 grid(8, 1);
if (seq2batch) { if (seq2batch) {
KeSequence2Batch<128, 8, 8, 1, 0><<< grid, threads, 0, STREAM_DEFAULT >>> KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
(batch, sequence, batchIndex, seqWidth, batchCount); batch, sequence, batchIndex, seqWidth, batchCount);
} else { } else {
KeSequence2Batch<128, 8, 8, 0, 0><<< grid, threads, 0, STREAM_DEFAULT >>> KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
(batch, sequence, batchIndex, seqWidth, batchCount); batch, sequence, batchIndex, seqWidth, batchCount);
} }
CHECK_SYNC("hl_sequence2batch_copy failed"); CHECK_SYNC("hl_sequence2batch_copy failed");
} }
void hl_sequence2batch_add(real *batch, void hl_sequence2batch_add(real* batch,
real *sequence, real* sequence,
int *batchIndex, int* batchIndex,
int seqWidth, int seqWidth,
int batchCount, int batchCount,
bool seq2batch) { bool seq2batch) {
...@@ -186,23 +183,22 @@ void hl_sequence2batch_add(real *batch, ...@@ -186,23 +183,22 @@ void hl_sequence2batch_add(real *batch,
dim3 threads(128, 8); dim3 threads(128, 8);
dim3 grid(8, 1); dim3 grid(8, 1);
if (seq2batch) { if (seq2batch) {
KeSequence2Batch<128, 8, 8, 1, 1><<< grid, threads, 0, STREAM_DEFAULT >>> KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
(batch, sequence, batchIndex, seqWidth, batchCount); batch, sequence, batchIndex, seqWidth, batchCount);
} else { } else {
KeSequence2Batch<128, 8, 8, 0, 1><<< grid, threads, 0, STREAM_DEFAULT >>> KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
(batch, sequence, batchIndex, seqWidth, batchCount); batch, sequence, batchIndex, seqWidth, batchCount);
} }
CHECK_SYNC("hl_sequence2batch_add failed"); CHECK_SYNC("hl_sequence2batch_add failed");
} }
template<bool normByTimes, bool seq2batch> template <bool normByTimes, bool seq2batch>
__global__ __global__ void KeSequence2BatchPadding(real* batch,
void KeSequence2BatchPadding(real* batch, real* sequence,
real* sequence, const int* sequenceStartPositions,
const int* sequenceStartPositions, const size_t sequenceWidth,
const size_t sequenceWidth, const size_t maxSequenceLength,
const size_t maxSequenceLength, const size_t numSequences) {
const size_t numSequences) {
int batchIdx = blockIdx.y; int batchIdx = blockIdx.y;
int sequenceStart = sequenceStartPositions[batchIdx]; int sequenceStart = sequenceStartPositions[batchIdx];
int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart; int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart;
...@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch, ...@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch,
if (seq2batch) { if (seq2batch) {
/* sequence -> batch */ /* sequence -> batch */
if (normByTimes) { if (normByTimes) {
KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, sequenceStartPositions, batch,
sequenceWidth, maxSequenceLength, numSequences); sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} else { } else {
KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, sequenceStartPositions, batch,
sequenceWidth, maxSequenceLength, numSequences); sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} }
} else { } else {
/* batch -> sequence */ /* batch -> sequence */
if (normByTimes) { if (normByTimes) {
KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, sequenceStartPositions, batch,
sequenceWidth, maxSequenceLength, numSequences); sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} else { } else {
KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, sequenceStartPositions, batch,
sequenceWidth, maxSequenceLength, numSequences); sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} }
} }
CHECK_SYNC("hl_sequence2batch_copy_padding failed"); CHECK_SYNC("hl_sequence2batch_copy_padding failed");
} }
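The padding variant indexes the flat buffers through sequenceStartPositions, which act as CSR-style offsets. A small worked example of the layout this implies; the concrete numbers are illustrative only, and the kernel body that consumes them sits outside this hunk:

// sequenceStartPositions = {0, 3, 5}  ->  numSequences = 2, lengths 3 and 2
// maxSequenceLength = 3, sequenceWidth = W
// "sequence" packs 5 real rows of width W back to back;
// "batch" reserves maxSequenceLength * numSequences = 6 rows of width W, so the
// shorter sequence leaves one padding slot that carries no data on the way in
// and is skipped on the way back; normByTimes presumably rescales the copied
// rows by the sequence length.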
__device__ inline float my_rsqrt(float x) { __device__ inline float my_rsqrt(float x) { return rsqrtf(x); }
return rsqrtf(x);
}
__device__ inline double my_rsqrt(double x) { __device__ inline double my_rsqrt(double x) { return rsqrt(x); }
return rsqrt(x);
}
__global__ void KeSequenceAvgForward(real* dst, __global__ void KeSequenceAvgForward(real* dst,
real* src, real* src,
...@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst, ...@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst,
for (int i = start; i < end; i++) { for (int i = start; i < end; i++) {
sum += src[i * width + col]; sum += src[i * width + col];
} }
sum = mode == 1 ? sum : sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
(mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength)); : sum * my_rsqrt((real)seqLength));
dst[gid] += sum; dst[gid] += sum;
} }
} }
...@@ -347,10 +355,10 @@ void hl_sequence_avg_forward(real* dst, ...@@ -347,10 +355,10 @@ void hl_sequence_avg_forward(real* dst,
int grid = DIVUP(width * height, 512); int grid = DIVUP(width * height, 512);
CHECK(mode == 0 || mode == 1 || mode == 2) CHECK(mode == 0 || mode == 1 || mode == 2)
<< "mode error in hl_sequence_avg_forward!"; << "mode error in hl_sequence_avg_forward!";
KeSequenceAvgForward<<< grid, block, 0, STREAM_DEFAULT >>> KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
(dst, src, starts, height, width, mode); dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_forward failed"); CHECK_SYNC("hl_sequence_avg_forward failed");
} }
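The mode switch in KeSequenceAvgForward selects one of three per-column reductions over a sequence. A plain CPU restatement of that line, useful for checking the GPU result (a sketch, not code from the repository):

// Requires <cmath>; "real" is float or double depending on the build,
// as elsewhere in this file.
real sequence_reduce_reference(const real* src, int start, int end,
                               int width, int col, int mode) {
  real sum = 0;
  for (int i = start; i < end; ++i) sum += src[i * width + col];
  int len = end - start;
  if (mode == 1) return sum;              // plain sum
  if (mode == 0) return sum / len;        // mean
  return sum / std::sqrt((real)len);      // mode 2: sum / sqrt(length)
}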
...@@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst, ...@@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst,
int seqLength = end - start; int seqLength = end - start;
if (seqLength == 0) return; if (seqLength == 0) return;
real grad = src[gid]; real grad = src[gid];
grad = mode == 1 ? grad : grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
(mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength)); : grad * my_rsqrt((real)seqLength));
for (int i = start; i < end; i++) { for (int i = start; i < end; i++) {
dst[i * width + col] += grad; dst[i * width + col] += grad;
} }
...@@ -392,9 +400,9 @@ void hl_sequence_avg_backward(real* dst, ...@@ -392,9 +400,9 @@ void hl_sequence_avg_backward(real* dst,
int grid = DIVUP(width * height, 512); int grid = DIVUP(width * height, 512);
CHECK(mode == 0 || mode == 1 || mode == 2) CHECK(mode == 0 || mode == 1 || mode == 2)
<< "mode error in hl_sequence_avg_backward!"; << "mode error in hl_sequence_avg_backward!";
KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>> KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
(dst, src, starts, height, width, mode); dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_backward failed"); CHECK_SYNC("hl_sequence_avg_backward failed");
} }
...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_cuda.h" #include "hl_cuda.h"
#include "hl_cuda_sparse.cuh"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_ops.cuh"
#include "hl_sparse.h" #include "hl_sparse.h"
#include "hl_sparse.ph" #include "hl_sparse.ph"
#include "hl_matrix_ops.cuh"
#include "hl_matrix_apply.cuh"
#include "hl_cuda_sparse.cuh"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
DEFINE_MATRIX_UNARY_PARAMETER_OP(mul_scalar, ONE_PARAMETER, a = a * p); DEFINE_MATRIX_UNARY_PARAMETER_OP(mul_scalar, ONE_PARAMETER, a = a * p);
...@@ -34,15 +33,15 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d, ...@@ -34,15 +33,15 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
CHECK(A_d->format == HL_SPARSE_CSR) << "matrix format error!"; CHECK(A_d->format == HL_SPARSE_CSR) << "matrix format error!";
if (A_d->nnz == 0) { if (A_d->nnz == 0) {
hl_gpu_apply_unary_op( hl_gpu_apply_unary_op(unary::Zero<real>(), C_d, dimM, dimN, dimN);
unary::Zero<real>(), C_d, dimM, dimN, dimN);
return; return;
} }
/* nnz != 0 */ /* nnz != 0 */
hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && A_d2->csr_row &&
A_d2->csr_row && A_d2->csr_col) << "parameter transa error!"; A_d2->csr_col)
<< "parameter transa error!";
int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
...@@ -50,21 +49,11 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d, ...@@ -50,21 +49,11 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
if (A_d->type == HL_NO_VALUE) { if (A_d->type == HL_NO_VALUE) {
KeSMatrixCsr2Dense<0> KeSMatrixCsr2Dense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(A_d2->csr_val, A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN);
A_d2->csr_row,
A_d2->csr_col,
C_d,
dimM,
dimN);
} else if (A_d->type == HL_FLOAT_VALUE) { } else if (A_d->type == HL_FLOAT_VALUE) {
KeSMatrixCsr2Dense<1> KeSMatrixCsr2Dense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(A_d2->csr_val, A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN);
A_d2->csr_row,
A_d2->csr_col,
C_d,
dimM,
dimN);
} else { } else {
} }
CHECK_SYNC("hl_matrix_csr2dense failed"); CHECK_SYNC("hl_matrix_csr2dense failed");
...@@ -80,15 +69,15 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, ...@@ -80,15 +69,15 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
CHECK(A_d->format == HL_SPARSE_CSC) << "matrix format error!"; CHECK(A_d->format == HL_SPARSE_CSC) << "matrix format error!";
if (A_d->nnz == 0) { if (A_d->nnz == 0) {
hl_gpu_apply_unary_op( hl_gpu_apply_unary_op(unary::Zero<real>(), C_d, dimM, dimN, dimN);
unary::Zero<real>(), C_d, dimM, dimN, dimN);
return; return;
} }
/* nnz != 0 */ /* nnz != 0 */
hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix); hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && A_d2->csc_row &&
A_d2->csc_row && A_d2->csc_col) << "parameter transa error!"; A_d2->csc_col)
<< "parameter transa error!";
int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
...@@ -96,21 +85,11 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, ...@@ -96,21 +85,11 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
if (A_d->type == HL_NO_VALUE) { if (A_d->type == HL_NO_VALUE) {
KeSMatrixCsc2Dense<0> KeSMatrixCsc2Dense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(A_d2->csc_val, A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN);
A_d2->csc_row,
A_d2->csc_col,
C_d,
dimM,
dimN);
} else if (A_d->type == HL_FLOAT_VALUE) { } else if (A_d->type == HL_FLOAT_VALUE) {
KeSMatrixCsc2Dense<1> KeSMatrixCsc2Dense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(A_d2->csc_val, A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN);
A_d2->csc_row,
A_d2->csc_col,
C_d,
dimM,
dimN);
} else { } else {
} }
CHECK_SYNC("hl_matrix_csc2dense failed"); CHECK_SYNC("hl_matrix_csc2dense failed");
...@@ -118,43 +97,43 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, ...@@ -118,43 +97,43 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
hl_matrix_format_t format, hl_matrix_format_t format,
hl_matrix_value_t value_type, hl_matrix_value_t value_type,
int dimM, int dimM,
int dimN, int dimN,
int nnz) { int nnz) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
<< "sparse matrix format error!"; << "sparse matrix format error!";
CHECK(value_type == HL_FLOAT_VALUE || value_type == HL_NO_VALUE) CHECK(value_type == HL_FLOAT_VALUE || value_type == HL_NO_VALUE)
<< "sparse matrix value type error!"; << "sparse matrix value type error!";
/* avoid malloc 0 bytes */ /* avoid malloc 0 bytes */
int nnz_s = (nnz == 0 ? 1 : nnz); int nnz_s = (nnz == 0 ? 1 : nnz);
if (format == HL_SPARSE_CSR) { if (format == HL_SPARSE_CSR) {
CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) char *tmp =
+ sizeof(_hl_csr_matrix)); (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
CHECK_NOTNULL(tmp); CHECK_NOTNULL(tmp);
hl_csr_matrix csr = (hl_csr_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
csr->sparsity = -1.0; csr->sparsity = -1.0;
if (value_type == HL_NO_VALUE) { if (value_type == HL_NO_VALUE) {
csr->csr_val = NULL; csr->csr_val = NULL;
csr->nnz_s = nnz_s; csr->nnz_s = nnz_s;
csr->row_s = dimM+1; csr->row_s = dimM + 1;
csr->csr_row = (int*)hl_malloc_device((dimM+1)*sizeof(int)); csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int));
csr->csr_col = (int*)hl_malloc_device((nnz_s)*sizeof(int)); csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int));
*A_d = (hl_sparse_matrix_s)tmp; *A_d = (hl_sparse_matrix_s)tmp;
(*A_d)->matrix = (hl_matrix_s)csr; (*A_d)->matrix = (hl_matrix_s)csr;
} else if (value_type == HL_FLOAT_VALUE) { } else if (value_type == HL_FLOAT_VALUE) {
csr->nnz_s = nnz_s; csr->nnz_s = nnz_s;
csr->row_s = dimM+1; csr->row_s = dimM + 1;
csr->csr_val = (real*)hl_malloc_device((nnz_s)*sizeof(real)); csr->csr_val = (real *)hl_malloc_device((nnz_s) * sizeof(real));
csr->csr_row = (int*)hl_malloc_device((dimM+1)*sizeof(int)); csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int));
csr->csr_col = (int*)hl_malloc_device((nnz_s)*sizeof(int)); csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int));
*A_d = (hl_sparse_matrix_s)tmp; *A_d = (hl_sparse_matrix_s)tmp;
(*A_d)->matrix = (hl_matrix_s)csr; (*A_d)->matrix = (hl_matrix_s)csr;
...@@ -162,28 +141,28 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, ...@@ -162,28 +141,28 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
} else if (format == HL_SPARSE_CSC) { } else if (format == HL_SPARSE_CSC) {
CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) char *tmp =
+ sizeof(_hl_csc_matrix)); (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
CHECK_NOTNULL(tmp); CHECK_NOTNULL(tmp);
hl_csc_matrix csc = (hl_csc_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
csc->sparsity = -1.0f; csc->sparsity = -1.0f;
if (value_type == HL_NO_VALUE) { if (value_type == HL_NO_VALUE) {
csc->csc_val = NULL; csc->csc_val = NULL;
csc->nnz_s = nnz_s; csc->nnz_s = nnz_s;
csc->col_s = dimN+1; csc->col_s = dimN + 1;
csc->csc_row = (int*)hl_malloc_device((nnz_s)*sizeof(int)); csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int));
csc->csc_col = (int*)hl_malloc_device((dimN+1)*sizeof(int)); csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int));
*A_d = (hl_sparse_matrix_s)tmp; *A_d = (hl_sparse_matrix_s)tmp;
(*A_d)->matrix = (hl_matrix_s)csc; (*A_d)->matrix = (hl_matrix_s)csc;
} else if (value_type == HL_FLOAT_VALUE) { } else if (value_type == HL_FLOAT_VALUE) {
csc->nnz_s = nnz_s; csc->nnz_s = nnz_s;
csc->col_s = dimN+1; csc->col_s = dimN + 1;
csc->csc_val = (real*)hl_malloc_device((nnz_s)*sizeof(real)); csc->csc_val = (real *)hl_malloc_device((nnz_s) * sizeof(real));
csc->csc_row = (int*)hl_malloc_device((nnz_s)*sizeof(int)); csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int));
csc->csc_col = (int*)hl_malloc_device((dimN+1)*sizeof(int)); csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int));
*A_d = (hl_sparse_matrix_s)tmp; *A_d = (hl_sparse_matrix_s)tmp;
(*A_d)->matrix = (hl_matrix_s)csc; (*A_d)->matrix = (hl_matrix_s)csc;
...@@ -200,7 +179,7 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, ...@@ -200,7 +179,7 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) { void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK(A_d->format == HL_SPARSE_CSR || A_d->format == HL_SPARSE_CSC) CHECK(A_d->format == HL_SPARSE_CSR || A_d->format == HL_SPARSE_CSC)
<< "sparse matrix format error!"; << "sparse matrix format error!";
if (A_d->matrix == NULL) { if (A_d->matrix == NULL) {
free(A_d); free(A_d);
...@@ -249,77 +228,77 @@ void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) { ...@@ -249,77 +228,77 @@ void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {
} }
void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
void * dest_d, void *dest_d,
size_t size, size_t size,
hl_matrix_format_t format, hl_matrix_format_t format,
hl_matrix_value_t value_type, hl_matrix_value_t value_type,
int dimM, int dimM,
int dimN, int dimN,
int nnz) { int nnz) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
<< "sparse matrix format error!"; << "sparse matrix format error!";
if (format == HL_SPARSE_CSR) { if (format == HL_SPARSE_CSR) {
CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
size_t size_ = (dimM+1)*sizeof(int) + nnz*sizeof(int); size_t size_ = (dimM + 1) * sizeof(int) + nnz * sizeof(int);
if (value_type != HL_NO_VALUE) { if (value_type != HL_NO_VALUE) {
size_ += nnz*sizeof(real); size_ += nnz * sizeof(real);
} }
CHECK_LE(size_, size) << "dest_d size(" << size CHECK_LE(size_, size) << "dest_d size(" << size
<< ") too small, should be bigger than(" << size_ << ")!"; << ") too small, should be bigger than(" << size_
<< ")!";
char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) char *tmp =
+ sizeof(_hl_csr_matrix)); (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
CHECK_NOTNULL(tmp); CHECK_NOTNULL(tmp);
hl_csr_matrix csr = (hl_csr_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
if (value_type == HL_NO_VALUE) { if (value_type == HL_NO_VALUE) {
csr->csr_val = NULL; csr->csr_val = NULL;
csr->csr_row = (int*)dest_d; csr->csr_row = (int *)dest_d;
csr->csr_col = (int*)((char*)dest_d + (dimM+1)*sizeof(int)); csr->csr_col = (int *)((char *)dest_d + (dimM + 1) * sizeof(int));
} else { } else {
csr->csr_val = (real*)dest_d; csr->csr_val = (real *)dest_d;
csr->csr_row = (int*)((char*)dest_d + nnz*sizeof(real)); csr->csr_row = (int *)((char *)dest_d + nnz * sizeof(real));
csr->csr_col = (int*)((char*)dest_d + csr->csr_col = (int *)((char *)dest_d + nnz * sizeof(real) +
nnz*sizeof(real) + (dimM + 1) * sizeof(int));
(dimM+1)*sizeof(int));
} }
csr->nnz_s = nnz; csr->nnz_s = nnz;
csr->row_s = dimM+1; csr->row_s = dimM + 1;
csr->sparsity = -1.0; csr->sparsity = -1.0;
*A_d = (hl_sparse_matrix_s)tmp; *A_d = (hl_sparse_matrix_s)tmp;
(*A_d)->matrix = (hl_matrix_s)csr; (*A_d)->matrix = (hl_matrix_s)csr;
} else if (format == HL_SPARSE_CSC) { } else if (format == HL_SPARSE_CSC) {
CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
size_t size_ = (dimN+1)*sizeof(int) + nnz*sizeof(int); size_t size_ = (dimN + 1) * sizeof(int) + nnz * sizeof(int);
if (value_type != HL_NO_VALUE) { if (value_type != HL_NO_VALUE) {
size_ += nnz*sizeof(real); size_ += nnz * sizeof(real);
} }
CHECK_LE(size_, size) << "dest_d size(" << size CHECK_LE(size_, size) << "dest_d size(" << size
<< ") too small, should be bigger than(" << size_ << ")!"; << ") too small, should be bigger than(" << size_
<< ")!";
char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) char *tmp =
+ sizeof(_hl_csc_matrix)); (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
CHECK_NOTNULL(tmp); CHECK_NOTNULL(tmp);
hl_csc_matrix csc = (hl_csc_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
if (value_type == HL_NO_VALUE) { if (value_type == HL_NO_VALUE) {
csc->csc_val = NULL; csc->csc_val = NULL;
csc->csc_col = (int*)dest_d; csc->csc_col = (int *)dest_d;
csc->csc_row = (int*)((char*)dest_d + (dimN+1)*sizeof(int)); csc->csc_row = (int *)((char *)dest_d + (dimN + 1) * sizeof(int));
} else { } else {
csc->csc_val = (real*)dest_d; csc->csc_val = (real *)dest_d;
csc->csc_col = (int*)((char*)dest_d + nnz*sizeof(real)); csc->csc_col = (int *)((char *)dest_d + nnz * sizeof(real));
csc->csc_row = (int*)((char*)dest_d + csc->csc_row = (int *)((char *)dest_d + nnz * sizeof(real) +
nnz*sizeof(real) + (dimN + 1) * sizeof(int));
(dimN+1)*sizeof(int));
} }
csc->nnz_s = nnz; csc->nnz_s = nnz;
csc->col_s = dimN+1; csc->col_s = dimN + 1;
csc->sparsity = -1.0f; csc->sparsity = -1.0f;
*A_d = (hl_sparse_matrix_s)tmp; *A_d = (hl_sparse_matrix_s)tmp;
(*A_d)->matrix = (hl_matrix_s)csc; (*A_d)->matrix = (hl_matrix_s)csc;
...@@ -333,11 +312,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, ...@@ -333,11 +312,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
} }
void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
real* value_d, real *value_d,
int* rows_d, int *rows_d,
int* cols_d, int *cols_d,
hl_matrix_format_t format, hl_matrix_format_t format,
hl_matrix_value_t value_type, hl_matrix_value_t value_type,
int dimM, int dimM,
int dimN, int dimN,
int nnz) { int nnz) {
...@@ -345,11 +324,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, ...@@ -345,11 +324,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
<< "sparse matrix format error!"; << "sparse matrix format error!";
if (format == HL_SPARSE_CSR) { if (format == HL_SPARSE_CSR) {
char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) char *tmp =
+ sizeof(_hl_csr_matrix)); (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
CHECK_NOTNULL(tmp); CHECK_NOTNULL(tmp);
hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
...@@ -362,8 +341,8 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, ...@@ -362,8 +341,8 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
*A_d = (hl_sparse_matrix_s)tmp; *A_d = (hl_sparse_matrix_s)tmp;
(*A_d)->matrix = (hl_matrix_s)csr; (*A_d)->matrix = (hl_matrix_s)csr;
} else if (format == HL_SPARSE_CSC) { } else if (format == HL_SPARSE_CSC) {
char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) char *tmp =
+ sizeof(_hl_csc_matrix)); (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
CHECK_NOTNULL(tmp); CHECK_NOTNULL(tmp);
hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
...@@ -396,35 +375,30 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix, ...@@ -396,35 +375,30 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
hl_stream_t stream) { hl_stream_t stream) {
CHECK_NOTNULL(csr_matrix); CHECK_NOTNULL(csr_matrix);
CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR) CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
<< "csr_matrix is not csr format!"; << "csr_matrix is not csr format!";
CHECK_NOTNULL(csr_matrix->matrix); CHECK_NOTNULL(csr_matrix->matrix);
hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix); hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
CHECK_LE(csr_matrix->nnz, csr->nnz_s) CHECK_LE(csr_matrix->nnz, csr->nnz_s) << "copy size " << csr_matrix->nnz
<< "copy size " << csr_matrix->nnz << " is bigger than alloc size "
<< " is bigger than alloc size " << csr->nnz_s; << csr->nnz_s;
CHECK_LE((csr_matrix->rows+1), csr->row_s) CHECK_LE((csr_matrix->rows + 1), csr->row_s)
<< "copy size " << (csr_matrix->rows + 1) << "copy size " << (csr_matrix->rows + 1) << " is bigger than alloc size "
<< " is bigger than alloc size " << csr->row_s; << csr->row_s;
CHECK(csr_matrix->type == HL_FLOAT_VALUE || CHECK(csr_matrix->type == HL_FLOAT_VALUE || csr_matrix->type == HL_NO_VALUE)
csr_matrix->type == HL_NO_VALUE) << "sparse matrix value type error!";
<< "sparse matrix value type error!";
if (csr_matrix->type == HL_NO_VALUE) { if (csr_matrix->type == HL_NO_VALUE) {
if (csr_row == NULL && csr_col == NULL) { if (csr_row == NULL && csr_col == NULL) {
return; return;
} else if (csr_row != NULL && csr_col != NULL) { } else if (csr_row != NULL && csr_col != NULL) {
hl_memcpy_async(csr->csr_row, hl_memcpy_async(
csr_row, csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream);
(csr_matrix->rows+1)*sizeof(int),
stream);
hl_memcpy_async(csr->csr_col, hl_memcpy_async(
csr_col, csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream);
(csr_matrix->nnz)*sizeof(int),
stream);
} else { } else {
LOG(FATAL) << "parameter csr_row or csr_col is null pointer!"; LOG(FATAL) << "parameter csr_row or csr_col is null pointer!";
} }
...@@ -432,30 +406,21 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix, ...@@ -432,30 +406,21 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
if (csr_val == NULL && csr_row == NULL && csr_col == NULL) { if (csr_val == NULL && csr_row == NULL && csr_col == NULL) {
return; return;
} else if (csr_val != NULL && csr_row == NULL && csr_col == NULL) { } else if (csr_val != NULL && csr_row == NULL && csr_col == NULL) {
hl_memcpy_async(csr->csr_val, hl_memcpy_async(
csr_val, csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream);
(csr_matrix->nnz)*sizeof(real),
stream);
} else if (csr_val != NULL && csr_row != NULL && csr_col != NULL) { } else if (csr_val != NULL && csr_row != NULL && csr_col != NULL) {
hl_memcpy_async(csr->csr_val, hl_memcpy_async(
csr_val, csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream);
(csr_matrix->nnz)*sizeof(real), hl_memcpy_async(
stream); csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream);
hl_memcpy_async(csr->csr_row, hl_memcpy_async(
csr_row, csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream);
(csr_matrix->rows+1)*sizeof(int),
stream);
hl_memcpy_async(csr->csr_col,
csr_col,
(csr_matrix->nnz)*sizeof(int),
stream);
} else { } else {
LOG(FATAL) << "parameter csr_row or csr_col is null pointer!"; LOG(FATAL) << "parameter csr_row or csr_col is null pointer!";
} }
} }
csr->sparsity = ((float)csr_matrix->nnz) / csr->sparsity = ((float)csr_matrix->nnz) / ((float)csr_matrix->rows) /
((float)csr_matrix->rows) /
((float)csr_matrix->cols); ((float)csr_matrix->cols);
} }
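The allocation and copy helpers are typically used together. A minimal sketch of uploading a host CSR matrix using only the calls visible in this file; HPPL_STREAM_DEFAULT and the need for the caller to fill rows/cols/nnz in the descriptor are assumptions not confirmed by this hunk:

// Sketch only; h_val/h_row/h_col are host-side CSR arrays.
hl_sparse_matrix_s upload_csr(real* h_val, int* h_row, int* h_col,
                              int rows, int cols, int nnz) {
  hl_sparse_matrix_s A;
  hl_malloc_sparse_matrix(&A, HL_SPARSE_CSR, HL_FLOAT_VALUE, rows, cols, nnz);
  A->rows = rows;  // set defensively; whether the malloc call already fills
  A->cols = cols;  // these descriptor fields is outside this hunk
  A->nnz = nnz;
  hl_memcpy_csr_matrix(A, h_val, h_row, h_col, HPPL_STREAM_DEFAULT);
  return A;  // release later with hl_free_sparse_matrix(A)
}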
...@@ -466,33 +431,28 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix, ...@@ -466,33 +431,28 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
hl_stream_t stream) { hl_stream_t stream) {
CHECK_NOTNULL(csc_matrix); CHECK_NOTNULL(csc_matrix);
CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC) CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
<< "csc_matrix is not csc format error!"; << "csc_matrix is not csc format error!";
hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix); hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
CHECK_LE(csc_matrix->nnz, csc->nnz_s) CHECK_LE(csc_matrix->nnz, csc->nnz_s) << "copy size " << csc_matrix->nnz
<< "copy size " << csc_matrix->nnz << " is bigger than alloc size "
<< " is bigger than alloc size " << csc->nnz_s; << csc->nnz_s;
CHECK_LE((csc_matrix->cols+1), csc->col_s) CHECK_LE((csc_matrix->cols + 1), csc->col_s)
<< "copy size " <<(csc_matrix->cols + 1) << "copy size " << (csc_matrix->cols + 1) << " is bigger than alloc size "
<< " is bigger than alloc size " << csc->col_s; << csc->col_s;
CHECK(csc_matrix->type == HL_FLOAT_VALUE || CHECK(csc_matrix->type == HL_FLOAT_VALUE || csc_matrix->type == HL_NO_VALUE)
csc_matrix->type == HL_NO_VALUE) << "sparse matrix value type error!";
<< "sparse matrix value type error!";
if (csc_matrix->type == HL_NO_VALUE) { if (csc_matrix->type == HL_NO_VALUE) {
if (csc_row == NULL && csc_col == NULL) { if (csc_row == NULL && csc_col == NULL) {
return; return;
} else if (csc_row != NULL && csc_col != NULL) { } else if (csc_row != NULL && csc_col != NULL) {
hl_memcpy_async(csc->csc_row, hl_memcpy_async(
csc_row, csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream);
(csc_matrix->nnz)*sizeof(int), hl_memcpy_async(
stream); csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream);
hl_memcpy_async(csc->csc_col,
csc_col,
(csc_matrix->cols+1)*sizeof(int),
stream);
} else { } else {
LOG(FATAL) << "parameter csc_row or csc_col is null pointer!"; LOG(FATAL) << "parameter csc_row or csc_col is null pointer!";
} }
...@@ -500,30 +460,21 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix, ...@@ -500,30 +460,21 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
if (csc_val == NULL && csc_row == NULL && csc_col == NULL) { if (csc_val == NULL && csc_row == NULL && csc_col == NULL) {
return; return;
} else if (csc_val != NULL && csc_row == NULL && csc_col == NULL) { } else if (csc_val != NULL && csc_row == NULL && csc_col == NULL) {
hl_memcpy_async(csc->csc_val, hl_memcpy_async(
csc_val, csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream);
(csc_matrix->nnz)*sizeof(real),
stream);
} else if (csc_val != NULL && csc_row != NULL && csc_col != NULL) { } else if (csc_val != NULL && csc_row != NULL && csc_col != NULL) {
hl_memcpy_async(csc->csc_val, hl_memcpy_async(
csc_val, csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream);
(csc_matrix->nnz)*sizeof(real), hl_memcpy_async(
stream); csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream);
hl_memcpy_async(csc->csc_row, hl_memcpy_async(
csc_row, csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream);
(csc_matrix->nnz)*sizeof(int),
stream);
hl_memcpy_async(csc->csc_col,
csc_col,
(csc_matrix->cols+1)*sizeof(int),
stream);
} else { } else {
LOG(FATAL) << "parameter csc_row or csc_col is null pointer!"; LOG(FATAL) << "parameter csc_row or csc_col is null pointer!";
} }
} }
csc->sparsity = ((float)csc_matrix->nnz) / csc->sparsity = ((float)csc_matrix->nnz) / ((float)csc_matrix->rows) /
((float)csc_matrix->rows) /
((float)csc_matrix->cols); ((float)csc_matrix->cols);
} }
...@@ -531,32 +482,23 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst, ...@@ -531,32 +482,23 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
hl_sparse_matrix_s src, hl_sparse_matrix_s src,
hl_stream_t stream) { hl_stream_t stream) {
CHECK(dst && src && dst->matrix && src->matrix) CHECK(dst && src && dst->matrix && src->matrix)
<< "parameter dst or src is null pointer!"; << "parameter dst or src is null pointer!";
CHECK_EQ(dst->format, src->format) CHECK_EQ(dst->format, src->format) << "sparse matrix format does not match!";
<< "sparse matrix format does not match!";
CHECK(dst->type != HL_FLOAT_VALUE || src->type != HL_NO_VALUE) CHECK(dst->type != HL_FLOAT_VALUE || src->type != HL_NO_VALUE)
<< "src sparse matrix is no value, dst sparse matrix has value!"; << "src sparse matrix is no value, dst sparse matrix has value!";
if (dst->format == HL_SPARSE_CSR) { if (dst->format == HL_SPARSE_CSR) {
dst->rows = src->rows; dst->rows = src->rows;
dst->cols = src->cols; dst->cols = src->cols;
dst->nnz = src->nnz; dst->nnz = src->nnz;
hl_csr_matrix csr = (hl_csr_matrix)src->matrix; hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
hl_memcpy_csr_matrix(dst, hl_memcpy_csr_matrix(dst, csr->csr_val, csr->csr_row, csr->csr_col, stream);
csr->csr_val,
csr->csr_row,
csr->csr_col,
stream);
} else if (dst->format == HL_SPARSE_CSC) { } else if (dst->format == HL_SPARSE_CSC) {
dst->rows = src->rows; dst->rows = src->rows;
dst->cols = src->cols; dst->cols = src->cols;
dst->nnz = src->nnz; dst->nnz = src->nnz;
hl_csc_matrix csc = (hl_csc_matrix)src->matrix; hl_csc_matrix csc = (hl_csc_matrix)src->matrix;
hl_memcpy_csc_matrix(dst, hl_memcpy_csc_matrix(dst, csc->csc_val, csc->csc_row, csc->csc_col, stream);
csc->csc_val,
csc->csc_row,
csc->csc_col,
stream);
} else { } else {
LOG(FATAL) << "sparse matrix format error!"; LOG(FATAL) << "sparse matrix format error!";
} }
...@@ -569,20 +511,24 @@ static void _beta_mul_c(real *c, int dimM, int dimN, real beta) { ...@@ -569,20 +511,24 @@ static void _beta_mul_c(real *c, int dimM, int dimN, real beta) {
if (beta == 0.0) { if (beta == 0.0) {
hl_gpu_apply_unary_op(unary::Zero<real>(), c, dimM, dimN, dimN); hl_gpu_apply_unary_op(unary::Zero<real>(), c, dimM, dimN, dimN);
} else { } else {
if (beta != 1.0){ if (beta != 1.0) {
hl_gpu_apply_unary_op( hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta), c, dimM, dimN, dimN);
unary::mul_scalar<real>(beta), c, dimM, dimN, dimN);
} }
} }
return; return;
} }
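_beta_mul_c is the usual GEMM pre-scaling step: it leaves the multiply kernels below with a pure accumulation, so the callers' overall contract is presumably C = alpha * op(A) * op(B) + beta * C.

// beta == 0 : C is zero-filled (its previous contents may be uninitialised)
// beta == 1 : C is left untouched
// otherwise : every element of C is scaled by beta before the product is added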
void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
real *B_d, hl_trans_op_t transb, hl_trans_op_t transa,
real *B_d,
hl_trans_op_t transb,
real *C_d, real *C_d,
int dimM, int dimN, int dimK, int dimM,
real alpha, real beta) { int dimN,
int dimK,
real alpha,
real beta) {
CHECK_EQ(transb, HPPL_OP_N); CHECK_EQ(transb, HPPL_OP_N);
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d); CHECK_NOTNULL(B_d);
...@@ -592,7 +538,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, ...@@ -592,7 +538,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) || if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) ||
(HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) { (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) {
LOG(FATAL) << "parameter error!"; LOG(FATAL) << "parameter error!";
} }
if (A_d->nnz == 0) { if (A_d->nnz == 0) {
...@@ -603,8 +549,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, ...@@ -603,8 +549,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
/* nnz != 0 */ /* nnz != 0 */
hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) || if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) ||
A_d2->csr_row == NULL || A_d2->csr_row == NULL || A_d2->csr_col == NULL) {
A_d2->csr_col == NULL) {
LOG(FATAL) << "parameter error!"; LOG(FATAL) << "parameter error!";
} }
...@@ -617,63 +562,63 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, ...@@ -617,63 +562,63 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
/* sparsity pattern */ /* sparsity pattern */
// A_d->sparsity; // A_d->sparsity;
if (A_d->type == HL_NO_VALUE) { if (A_d->type == HL_NO_VALUE) {
KeSMatrixCsrMulDense<0> KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d2->csr_val, A_d2->csr_val,
A_d2->csr_col, A_d2->csr_col,
A_d2->csr_row, A_d2->csr_row,
B_d, B_d,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} else { } else {
KeSMatrixCsrMulDense<1> KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d2->csr_val, A_d2->csr_val,
A_d2->csr_col, A_d2->csr_col,
A_d2->csr_row, A_d2->csr_row,
B_d, B_d,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} }
} else if (HPPL_OP_T == transa) { } else if (HPPL_OP_T == transa) {
_beta_mul_c(C_d, dimM, dimN, beta); _beta_mul_c(C_d, dimM, dimN, beta);
int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / int blocksX =
CU_CSC_MUL_DENSE_BLOCK_N; (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
int blocksY = (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / int blocksY =
CU_CSC_MUL_DENSE_BLOCK_K; (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y); dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
if (A_d->type == HL_NO_VALUE) { if (A_d->type == HL_NO_VALUE) {
KeSMatrixCscMulDense<0> KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d2->csr_val, A_d2->csr_val,
A_d2->csr_col, A_d2->csr_col,
A_d2->csr_row, A_d2->csr_row,
B_d, B_d,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} else { } else {
KeSMatrixCscMulDense<1> KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d2->csr_val, A_d2->csr_val,
A_d2->csr_col, A_d2->csr_col,
A_d2->csr_row, A_d2->csr_row,
B_d, B_d,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} }
} else { } else {
LOG(FATAL) << "parameter transa error!"; LOG(FATAL) << "parameter transa error!";
...@@ -682,11 +627,16 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, ...@@ -682,11 +627,16 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
CHECK_SYNC("hl_matrix_csr_mul_dense failed"); CHECK_SYNC("hl_matrix_csr_mul_dense failed");
} }
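A hypothetical call of the sparse-times-dense product above, assuming A is an M x K CSR matrix on the device and B, C are dense device buffers laid out the way the kernels expect (the layout convention itself is not visible in this hunk); note the function requires transb == HPPL_OP_N:

// Sketch under the assumptions above.
void csr_times_dense_example(hl_sparse_matrix_s A, real* B, real* C,
                             int dimM, int dimN, int dimK) {
  // C(MxN) = 1.0 * A(MxK, CSR) * B(KxN) + 0.0 * C
  hl_matrix_csr_mul_dense(A, HPPL_OP_N, B, HPPL_OP_N, C,
                          dimM, dimN, dimK, 1.0, 0.0);
}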
void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, void hl_matrix_dense_mul_csc(real *A_d,
hl_sparse_matrix_s B_d, hl_trans_op_t transb, hl_trans_op_t transa,
hl_sparse_matrix_s B_d,
hl_trans_op_t transb,
real *C_d, real *C_d,
int dimM, int dimN, int dimK, int dimM,
real alpha, real beta) { int dimN,
int dimK,
real alpha,
real beta) {
CHECK_EQ(transa, HPPL_OP_N); CHECK_EQ(transa, HPPL_OP_N);
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d); CHECK_NOTNULL(B_d);
...@@ -698,8 +648,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, ...@@ -698,8 +648,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
LOG(FATAL) << "parameter dims error!"; LOG(FATAL) << "parameter dims error!";
} }
CHECK_EQ(B_d->format, HL_SPARSE_CSC) CHECK_EQ(B_d->format, HL_SPARSE_CSC) << "matrix format error!";
<< "matrix format error!";
if (B_d->nnz == 0) { if (B_d->nnz == 0) {
_beta_mul_c(C_d, dimM, dimN, beta); _beta_mul_c(C_d, dimM, dimN, beta);
...@@ -709,8 +658,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, ...@@ -709,8 +658,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
/* nnz != 0 */ /* nnz != 0 */
hl_csc_matrix B_d2 = (hl_csc_matrix)(B_d->matrix); hl_csc_matrix B_d2 = (hl_csc_matrix)(B_d->matrix);
if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) || if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) ||
B_d2->csc_row == NULL || B_d2->csc_row == NULL || B_d2->csc_col == NULL) {
B_d2->csc_col == NULL) {
LOG(FATAL) << "parameter B is null!"; LOG(FATAL) << "parameter B is null!";
} }
...@@ -721,60 +669,60 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, ...@@ -721,60 +669,60 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
if (B_d->type == HL_NO_VALUE) { if (B_d->type == HL_NO_VALUE) {
KeSMatrixDenseMulCsc<0> KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d, A_d,
B_d2->csc_val, B_d2->csc_val,
B_d2->csc_row, B_d2->csc_row,
B_d2->csc_col, B_d2->csc_col,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} else { } else {
KeSMatrixDenseMulCsc<1> KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d, A_d,
B_d2->csc_val, B_d2->csc_val,
B_d2->csc_row, B_d2->csc_row,
B_d2->csc_col, B_d2->csc_col,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} }
} else if (transb == HPPL_OP_T) { } else if (transb == HPPL_OP_T) {
_beta_mul_c(C_d, dimM, dimN, beta); _beta_mul_c(C_d, dimM, dimN, beta);
int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X; int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M; int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y); dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
if (B_d->type == HL_NO_VALUE) { if (B_d->type == HL_NO_VALUE) {
KeSMatrixDenseMulCsr<0> KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d, A_d,
B_d2->csc_val, B_d2->csc_val,
B_d2->csc_col, B_d2->csc_col,
B_d2->csc_row, B_d2->csc_row,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} else { } else {
KeSMatrixDenseMulCsr<1> KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d, A_d,
B_d2->csc_val, B_d2->csc_val,
B_d2->csc_col, B_d2->csc_col,
B_d2->csc_row, B_d2->csc_row,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} }
} else { } else {
LOG(FATAL) << "parameter transb error!"; LOG(FATAL) << "parameter transb error!";
...@@ -783,24 +731,28 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, ...@@ -783,24 +731,28 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
CHECK_SYNC("hl_matrix_dense_mul_csc failed"); CHECK_SYNC("hl_matrix_dense_mul_csc failed");
} }
void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, void hl_matrix_dense_mul_csr(real *A_d,
hl_sparse_matrix_s B_d, hl_trans_op_t transb, hl_trans_op_t transa,
hl_sparse_matrix_s B_d,
hl_trans_op_t transb,
real *C_d, real *C_d,
int dimM, int dimN, int dimK, int dimM,
real alpha, real beta) { int dimN,
int dimK,
real alpha,
real beta) {
CHECK_EQ(transa, HPPL_OP_N); CHECK_EQ(transa, HPPL_OP_N);
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d); CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
if (dimM <= 0 || dimN <= 0 || dimK <= 0 if (dimM <= 0 || dimN <= 0 || dimK <= 0 ||
|| (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) ||
|| (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) { (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) {
LOG(FATAL) << "parameter dims error!"; LOG(FATAL) << "parameter dims error!";
} }
CHECK_EQ(B_d->format, HL_SPARSE_CSR) CHECK_EQ(B_d->format, HL_SPARSE_CSR) << "matrix format error!";
<< "matrix format error!";
if (B_d->nnz == 0) { if (B_d->nnz == 0) {
_beta_mul_c(C_d, dimM, dimN, beta); _beta_mul_c(C_d, dimM, dimN, beta);
...@@ -810,41 +762,40 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, ...@@ -810,41 +762,40 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
/* nnz != 0 */ /* nnz != 0 */
hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix); hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix);
if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) || if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) ||
B_d2->csr_row == NULL || B_d2->csr_row == NULL || B_d2->csr_col == NULL) {
B_d2->csr_col == NULL) {
LOG(FATAL) << "parameter transa error!"; LOG(FATAL) << "parameter transa error!";
} }
if (transb == HPPL_OP_N) { if (transb == HPPL_OP_N) {
_beta_mul_c(C_d, dimM, dimN, beta); _beta_mul_c(C_d, dimM, dimN, beta);
int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X; int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M; int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y); dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
if (B_d->type == HL_NO_VALUE) { if (B_d->type == HL_NO_VALUE) {
KeSMatrixDenseMulCsr<0> KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d, A_d,
B_d2->csr_val, B_d2->csr_val,
B_d2->csr_row, B_d2->csr_row,
B_d2->csr_col, B_d2->csr_col,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} else { } else {
KeSMatrixDenseMulCsr<1> KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d, A_d,
B_d2->csr_val, B_d2->csr_val,
B_d2->csr_row, B_d2->csr_row,
B_d2->csr_col, B_d2->csr_col,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} }
} else if (transb == HPPL_OP_T) { } else if (transb == HPPL_OP_T) {
int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST; int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST;
...@@ -852,29 +803,29 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, ...@@ -852,29 +803,29 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST); dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
if (B_d->type == HL_NO_VALUE) { if (B_d->type == HL_NO_VALUE) {
KeSMatrixDenseMulCsc<0> KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d, A_d,
B_d2->csr_val, B_d2->csr_val,
B_d2->csr_col, B_d2->csr_col,
B_d2->csr_row, B_d2->csr_row,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} else { } else {
KeSMatrixDenseMulCsc<1> KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d, A_d,
B_d2->csr_val, B_d2->csr_val,
B_d2->csr_col, B_d2->csr_col,
B_d2->csr_row, B_d2->csr_row,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} }
} else { } else {
LOG(FATAL) << "parameter transb error!"; LOG(FATAL) << "parameter transb error!";
...@@ -883,11 +834,16 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, ...@@ -883,11 +834,16 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
CHECK_SYNC("hl_matrix_dense_mul_csr failed"); CHECK_SYNC("hl_matrix_dense_mul_csr failed");
} }
void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
real *B_d, hl_trans_op_t transb, hl_trans_op_t transa,
real *B_d,
hl_trans_op_t transb,
real *C_d, real *C_d,
int dimM, int dimN, int dimK, int dimM,
real alpha, real beta) { int dimN,
int dimK,
real alpha,
real beta) {
CHECK_EQ(transb, HPPL_OP_N); CHECK_EQ(transb, HPPL_OP_N);
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d); CHECK_NOTNULL(B_d);
...@@ -908,42 +864,43 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, ...@@ -908,42 +864,43 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
/* nnz != 0 */ /* nnz != 0 */
hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix); hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) || if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) ||
A_d2->csc_row == NULL || A_d2->csc_row == NULL || A_d2->csc_col == NULL) {
A_d2->csc_col == NULL) {
LOG(FATAL) << "parameter error!"; LOG(FATAL) << "parameter error!";
} }
if (HPPL_OP_N == transa) { if (HPPL_OP_N == transa) {
_beta_mul_c(C_d, dimM, dimN, beta); _beta_mul_c(C_d, dimM, dimN, beta);
int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N -1)/CU_CSC_MUL_DENSE_BLOCK_N; int blocksX =
int blocksY = (dimK + CU_CSC_MUL_DENSE_BLOCK_K -1)/CU_CSC_MUL_DENSE_BLOCK_K; (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
int blocksY =
(dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y); dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
if (A_d->type == HL_NO_VALUE) { if (A_d->type == HL_NO_VALUE) {
KeSMatrixCscMulDense<0> KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d2->csc_val, A_d2->csc_val,
A_d2->csc_row, A_d2->csc_row,
A_d2->csc_col, A_d2->csc_col,
B_d, B_d,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} else { } else {
KeSMatrixCscMulDense<1> KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d2->csc_val, A_d2->csc_val,
A_d2->csc_row, A_d2->csc_row,
A_d2->csc_col, A_d2->csc_col,
B_d, B_d,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} }
} else if (HPPL_OP_T == transa) { } else if (HPPL_OP_T == transa) {
int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N; int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N;
...@@ -954,29 +911,29 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, ...@@ -954,29 +911,29 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
/* sparsity pattern */ /* sparsity pattern */
// A_d->sparsity; // A_d->sparsity;
if (A_d->type == HL_NO_VALUE) { if (A_d->type == HL_NO_VALUE) {
KeSMatrixCsrMulDense<0> KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d2->csc_val, A_d2->csc_val,
A_d2->csc_row, A_d2->csc_row,
A_d2->csc_col, A_d2->csc_col,
B_d, B_d,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} else { } else {
KeSMatrixCsrMulDense<1> KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d, C_d,
A_d2->csc_val, A_d2->csc_val,
A_d2->csc_row, A_d2->csc_row,
A_d2->csc_col, A_d2->csc_col,
B_d, B_d,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
} }
} else { } else {
LOG(FATAL) << "parameter transa error!"; LOG(FATAL) << "parameter transa error!";
...@@ -985,11 +942,16 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, ...@@ -985,11 +942,16 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
CHECK_SYNC("hl_matrix_csc_mul_dense failed"); CHECK_SYNC("hl_matrix_csc_mul_dense failed");
} }
void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, void hl_sparse_matrix_mul(real *A_d,
real *B_d, hl_trans_op_t transb, hl_trans_op_t transa,
hl_sparse_matrix_s C_d, real *B_d,
int dimM, int dimN, int dimK, hl_trans_op_t transb,
real alpha, real beta) { hl_sparse_matrix_s C_d,
int dimM,
int dimN,
int dimK,
real alpha,
real beta) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d); CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
...@@ -1000,18 +962,14 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, ...@@ -1000,18 +962,14 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
if (C_d->format == HL_SPARSE_CSC) { if (C_d->format == HL_SPARSE_CSC) {
hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix); hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix);
if (C_d2->csc_val == NULL || if (C_d2->csc_val == NULL || C_d2->csc_row == NULL ||
C_d2->csc_row == NULL ||
C_d2->csc_col == NULL) { C_d2->csc_col == NULL) {
LOG(FATAL) << "parameter error!"; LOG(FATAL) << "parameter error!";
} }
if (beta != 1.0) { if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta), hl_gpu_apply_unary_op(
C_d2->csc_val, unary::mul_scalar<real>(beta), C_d2->csc_val, 1, C_d->nnz, C_d->nnz);
1,
C_d->nnz,
C_d->nnz);
} }
int blocksX = dimN; int blocksX = dimN;
...@@ -1020,34 +978,30 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, ...@@ -1020,34 +978,30 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
bool transA = transa == HPPL_OP_T ? 1 : 0; bool transA = transa == HPPL_OP_T ? 1 : 0;
bool transB = transb == HPPL_OP_T ? 1 : 0; bool transB = transb == HPPL_OP_T ? 1 : 0;
KeSMatrixDenseMulDense2CSC KeSMatrixDenseMulDense2CSC<<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d2->csc_val, C_d2->csc_val,
C_d2->csc_row, C_d2->csc_row,
C_d2->csc_col, C_d2->csc_col,
A_d, A_d,
B_d, B_d,
transA, transA,
transB, transB,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
CHECK_SYNC("hl_sparse_matrix_mul failed"); CHECK_SYNC("hl_sparse_matrix_mul failed");
} else { } else {
hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix); hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix);
if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) || if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) ||
C_d2->csr_row == NULL || C_d2->csr_row == NULL || C_d2->csr_col == NULL) {
C_d2->csr_col == NULL) {
LOG(FATAL) << "parameter error!"; LOG(FATAL) << "parameter error!";
} }
if (beta != 1.0) { if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta), hl_gpu_apply_unary_op(
C_d2->csr_val, unary::mul_scalar<real>(beta), C_d2->csr_val, 1, C_d->nnz, C_d->nnz);
1,
C_d->nnz,
C_d->nnz);
} }
bool transA = transa == HPPL_OP_T ? 1 : 0; bool transA = transa == HPPL_OP_T ? 1 : 0;
...@@ -1058,20 +1012,20 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, ...@@ -1058,20 +1012,20 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1); dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeSMatrixDenseMulDense2CSR KeSMatrixDenseMulDense2CSR<<<grid, threads, 0, STREAM_DEFAULT>>>(
<<<grid, threads, 0, STREAM_DEFAULT>>>(C_d2->csr_val, C_d2->csr_val,
C_d2->csr_row, C_d2->csr_row,
C_d2->csr_col, C_d2->csr_col,
A_d, A_d,
B_d, B_d,
transA, transA,
transB, transB,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
CHECK_SYNC("hl_sparse_matrix_mul failed"); CHECK_SYNC("hl_sparse_matrix_mul failed");
} else { } else {
CHECK(!transA) << "Not supported A is trans and B is not trans!"; CHECK(!transA) << "Not supported A is trans and B is not trans!";
...@@ -1080,21 +1034,21 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, ...@@ -1080,21 +1034,21 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1; avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1;
int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE); int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE);
dim3 grid(gridx, dimM); dim3 grid(gridx, dimM);
KeSMatrixDenseMulDenseTrans2CSR KeSMatrixDenseMulDenseTrans2CSR<<<grid, block, 0, STREAM_DEFAULT>>>(
<<<grid, block, 0, STREAM_DEFAULT>>>(C_d2->csr_val, C_d2->csr_val,
C_d2->csr_row, C_d2->csr_row,
C_d2->csr_col, C_d2->csr_col,
A_d, A_d,
B_d, B_d,
transA, transA,
transB, transB,
dimM, dimM,
dimN, dimN,
dimK, dimK,
alpha, alpha,
beta); beta);
CHECK_SYNC("hl_sparse_matrix_mul failed"); CHECK_SYNC("hl_sparse_matrix_mul failed");
} }
} }
} }
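hl_sparse_matrix_mul writes a dense-times-dense product into an existing sparse result: the kernels take C_d's row/col arrays as inputs, so the sparsity pattern apparently has to be set up before the call. A hypothetical use accumulating onto a fixed CSR pattern:

// Sketch only; C keeps its existing pattern, and alpha/beta behave as in
// the dense helpers above.
void dense_mul_dense_into_csr(real* A, real* B, hl_sparse_matrix_s C,
                              int dimM, int dimN, int dimK) {
  // accumulate 1.0 * A(MxK) * B(KxN) onto the current values of C(MxN)
  hl_sparse_matrix_mul(A, HPPL_OP_N, B, HPPL_OP_N, C,
                       dimM, dimN, dimK, 1.0, 1.0);
}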
...@@ -1111,7 +1065,7 @@ void hl_memcpy_from_csc_matrix(real *csc_val, ...@@ -1111,7 +1065,7 @@ void hl_memcpy_from_csc_matrix(real *csc_val,
CHECK_NOTNULL(csc_col); CHECK_NOTNULL(csc_col);
CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC) CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
<< "csc_matrix is not csc format error!"; << "csc_matrix is not csc format error!";
if (csc_matrix->nnz > row_size || if (csc_matrix->nnz > row_size ||
csc_matrix->cols + 1 > static_cast<int>(col_size)) { csc_matrix->cols + 1 > static_cast<int>(col_size)) {
...@@ -1119,20 +1073,20 @@ void hl_memcpy_from_csc_matrix(real *csc_val, ...@@ -1119,20 +1073,20 @@ void hl_memcpy_from_csc_matrix(real *csc_val,
} }
hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix); hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
hl_memcpy_async((void*)csc_row, hl_memcpy_async((void *)csc_row,
(void*)csc->csc_row, (void *)csc->csc_row,
(csc_matrix->nnz) * sizeof(int), (csc_matrix->nnz) * sizeof(int),
stream); stream);
hl_memcpy_async((void*)csc_col, hl_memcpy_async((void *)csc_col,
(void*)csc->csc_col, (void *)csc->csc_col,
(csc_matrix->cols + 1) * sizeof(int), (csc_matrix->cols + 1) * sizeof(int),
stream); stream);
if (csc_matrix->type == HL_FLOAT_VALUE) { if (csc_matrix->type == HL_FLOAT_VALUE) {
if (csc_val != NULL) { if (csc_val != NULL) {
CHECK_LE(csc_matrix->nnz, val_size) << "size not match!"; CHECK_LE(csc_matrix->nnz, val_size) << "size not match!";
hl_memcpy_async((void*)csc_val, hl_memcpy_async((void *)csc_val,
(void*)csc->csc_val, (void *)csc->csc_val,
(csc_matrix->nnz)*sizeof(real), (csc_matrix->nnz) * sizeof(real),
stream); stream);
} else { } else {
LOG(FATAL) << "parameter csr_val is null pointer!"; LOG(FATAL) << "parameter csr_val is null pointer!";
...@@ -1152,7 +1106,7 @@ void hl_memcpy_from_csr_matrix(real *csr_val, ...@@ -1152,7 +1106,7 @@ void hl_memcpy_from_csr_matrix(real *csr_val,
CHECK_NOTNULL(csr_row); CHECK_NOTNULL(csr_row);
CHECK_NOTNULL(csr_col); CHECK_NOTNULL(csr_col);
CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR) CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
<< "csr_matrix is not csr format error!"; << "csr_matrix is not csr format error!";
if (csr_matrix->nnz > col_size || if (csr_matrix->nnz > col_size ||
csr_matrix->rows + 1 > static_cast<int>(row_size)) { csr_matrix->rows + 1 > static_cast<int>(row_size)) {
...@@ -1160,20 +1114,20 @@ void hl_memcpy_from_csr_matrix(real *csr_val, ...@@ -1160,20 +1114,20 @@ void hl_memcpy_from_csr_matrix(real *csr_val,
} }
hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix); hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
hl_memcpy_async((void*)csr_row, hl_memcpy_async((void *)csr_row,
(void*)csr->csr_row, (void *)csr->csr_row,
(csr_matrix->rows+1)*sizeof(int), (csr_matrix->rows + 1) * sizeof(int),
stream); stream);
hl_memcpy_async((void*)csr_col, hl_memcpy_async((void *)csr_col,
(void*)csr->csr_col, (void *)csr->csr_col,
(csr_matrix->nnz)*sizeof(int), (csr_matrix->nnz) * sizeof(int),
stream); stream);
if (csr_matrix->type == HL_FLOAT_VALUE) { if (csr_matrix->type == HL_FLOAT_VALUE) {
if (csr_val != NULL) { if (csr_val != NULL) {
CHECK_LE(csr_matrix->nnz, val_size) << "size not match!"; CHECK_LE(csr_matrix->nnz, val_size) << "size not match!";
hl_memcpy_async((void*)csr_val, hl_memcpy_async((void *)csr_val,
(void*)csr->csr_val, (void *)csr->csr_val,
(csr_matrix->nnz)*sizeof(real), (csr_matrix->nnz) * sizeof(real),
stream); stream);
} else { } else {
LOG(FATAL) << "parameter csr_val is null pointer!"; LOG(FATAL) << "parameter csr_val is null pointer!";
...@@ -1181,8 +1135,8 @@ void hl_memcpy_from_csr_matrix(real *csr_val, ...@@ -1181,8 +1135,8 @@ void hl_memcpy_from_csr_matrix(real *csr_val,
} }
} }
void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM, void hl_sparse_matrix_column_sum(
int dimN, real scale) { real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
if (B_d->format == HL_SPARSE_CSR) { if (B_d->format == HL_SPARSE_CSR) {
hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale); hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale);
} else { } else {
...@@ -1190,8 +1144,8 @@ void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM, ...@@ -1190,8 +1144,8 @@ void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM,
} }
} }
void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d, void hl_matrix_csr_column_sum(
int dimM, int dimN, real scale) { real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d); CHECK_NOTNULL(B_d);
...@@ -1216,8 +1170,7 @@ void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d, ...@@ -1216,8 +1170,7 @@ void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d,
CHECK_SYNC("hl_matrix_csr_column_sum failed"); CHECK_SYNC("hl_matrix_csr_column_sum failed");
} }
void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) {
real* B_d, real scale) {
if (A_d->format == HL_SPARSE_CSR) { if (A_d->format == HL_SPARSE_CSR) {
hl_matrix_csr_add_bias(A_d, B_d, scale); hl_matrix_csr_add_bias(A_d, B_d, scale);
} else { } else {
...@@ -1225,8 +1178,7 @@ void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, ...@@ -1225,8 +1178,7 @@ void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
} }
} }
void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d, void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) {
real scale) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d); CHECK_NOTNULL(B_d);
...@@ -1247,8 +1199,12 @@ void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d, ...@@ -1247,8 +1199,12 @@ void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d,
CHECK_SYNC("hl_sparse_matrix_add_bias failed"); CHECK_SYNC("hl_sparse_matrix_add_bias failed");
} }
void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM, void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
int dimN, real alpha, real beta) { real *B_d,
int dimM,
int dimN,
real alpha,
real beta) {
if (A_d->format == HL_SPARSE_CSR) { if (A_d->format == HL_SPARSE_CSR) {
hl_matrix_csr_add_dense(A_d, B_d, dimM, dimN, alpha, beta); hl_matrix_csr_add_dense(A_d, B_d, dimM, dimN, alpha, beta);
} else { } else {
...@@ -1256,8 +1212,12 @@ void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM, ...@@ -1256,8 +1212,12 @@ void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM,
} }
} }
void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM, void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
int dimN, real alpha, real beta) { real *B_d,
int dimM,
int dimN,
real alpha,
real beta) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d); CHECK_NOTNULL(B_d);
...@@ -1277,20 +1237,26 @@ void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM, ...@@ -1277,20 +1237,26 @@ void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM,
gridX = gridX > 0 ? gridX : 1; gridX = gridX > 0 ? gridX : 1;
dim3 block(512, 1); dim3 block(512, 1);
dim3 grid(gridX, dimM); dim3 grid(gridX, dimM);
KeSMatrixCsrAddDense<<<grid, block, 0, STREAM_DEFAULT>>>( KeSMatrixCsrAddDense<<<grid, block, 0, STREAM_DEFAULT>>>(A_d2->csr_val,
A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, B_d, alpha, beta, dimM, dimN); A_d2->csr_row,
A_d2->csr_col,
B_d,
alpha,
beta,
dimM,
dimN);
CHECK_SYNC("hl_sparse_matrix_add_dense failed"); CHECK_SYNC("hl_sparse_matrix_add_dense failed");
} }
int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) {
__sparse_get_return__(sMat, row); __sparse_get_return__(sMat, row);
} }
int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) {
__sparse_get_return__(sMat, col); __sparse_get_return__(sMat, col);
} }
real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) { real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
__sparse_get_return__(sMat, val); __sparse_get_return__(sMat, val);
} }
...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cmath>
#include <stdlib.h> #include <stdlib.h>
#include "hl_cuda.h" #include <cmath>
#include "hl_time.h"
#include "hl_base.h" #include "hl_base.h"
#include "hl_cuda.h"
#include "hl_perturbation_util.cuh" #include "hl_perturbation_util.cuh"
#include "hl_time.h"
#define _USE_MATH_DEFINES #define _USE_MATH_DEFINES
...@@ -30,10 +29,16 @@ limitations under the License. */ ...@@ -30,10 +29,16 @@ limitations under the License. */
* centerX, centerY: translation. * centerX, centerY: translation.
* sourceX, sourceY: output coordinates in the original image. * sourceX, sourceY: output coordinates in the original image.
*/ */
__device__ void getTranformCoord(int x, int y, real theta, real scale, __device__ void getTranformCoord(int x,
real tgtCenter, real imgCenter, int y,
real centerR, real centerC, real theta,
int* sourceX, int* sourceY) { real scale,
real tgtCenter,
real imgCenter,
real centerR,
real centerC,
int* sourceX,
int* sourceY) {
real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)}; real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};
// compute coornidates in the rotated and scaled image // compute coornidates in the rotated and scaled image
...@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale, ...@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
* created by Wei Xu (genome), converted by Jiang Wang * created by Wei Xu (genome), converted by Jiang Wang
*/ */
__global__ void kSamplingPatches(const real* imgs, real* targets, __global__ void kSamplingPatches(const real* imgs,
int imgSize, int tgtSize, const int channels, real* targets,
int samplingRate, const real* thetas, int imgSize,
const real* scales, const int* centerRs, int tgtSize,
const int* centerCs, const real padValue, const int channels,
int samplingRate,
const real* thetas,
const real* scales,
const int* centerRs,
const int* centerCs,
const real padValue,
const int numImages) { const int numImages) {
const int caseIdx = blockIdx.x * 4 + threadIdx.x; const int caseIdx = blockIdx.x * 4 + threadIdx.x;
const int pxIdx = blockIdx.y * 128 + threadIdx.y; const int pxIdx = blockIdx.y * 128 + threadIdx.y;
...@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets, ...@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
const int pxY = pxIdx / tgtSize; const int pxY = pxIdx / tgtSize;
int srcPxX, srcPxY; int srcPxX, srcPxY;
getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter, getTranformCoord(pxX,
imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX, pxY,
thetas[imgIdx],
scales[imgIdx],
tgtCenter,
imgCenter,
centerCs[caseIdx],
centerRs[caseIdx],
&srcPxX,
&srcPxY); &srcPxY);
imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels; imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
...@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets, ...@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
* *
* created by Wei Xu * created by Wei Xu
*/ */
void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, void hl_generate_disturb_params(real*& gpuAngle,
int*& gpuCenterR, int*& gpuCenterC, real*& gpuScaleRatio,
int numImages, int imgSize, real rotateAngle, int*& gpuCenterR,
real scaleRatio, int samplingRate, int*& gpuCenterC,
int numImages,
int imgSize,
real rotateAngle,
real scaleRatio,
int samplingRate,
bool isTrain) { bool isTrain) {
// The number of output samples. // The number of output samples.
int numPatches = numImages * samplingRate; int numPatches = numImages * samplingRate;
...@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, ...@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
for (int i = 0; i < numImages; i++) { for (int i = 0; i < numImages; i++) {
r_angle[i] = r_angle[i] =
(rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0) // NOLINT (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0) // NOLINT
- 0.5); -
0.5);
s_ratio[i] = s_ratio[i] =
1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio; // NOLINT 1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio; // NOLINT
} }
...@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, ...@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
int pxY = int pxY =
(int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0)); // NOLINT (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0)); // NOLINT
const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]), const real H[4] = {cos(-r_angle[i]),
sin(-r_angle[i]), cos(-r_angle[i])}; -sin(-r_angle[i]),
sin(-r_angle[i]),
cos(-r_angle[i])};
real x = pxX - imgCenter; real x = pxX - imgCenter;
real y = pxY - imgCenter; real y = pxY - imgCenter;
real xx = H[0] * x + H[1] * y; real xx = H[0] * x + H[1] * y;
...@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, ...@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
delete[] center_c; delete[] center_c;
} }
void hl_conv_random_disturb_with_params(const real* images, int imgSize, void hl_conv_random_disturb_with_params(const real* images,
int tgtSize, int channels, int imgSize,
int numImages, int samplingRate, int tgtSize,
int channels,
int numImages,
int samplingRate,
const real* gpuRotationAngle, const real* gpuRotationAngle,
const real* gpuScaleRatio, const real* gpuScaleRatio,
const int* gpuCenterR, const int* gpuCenterR,
...@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize, ...@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
dim3 threadsPerBlock(4, 128); dim3 threadsPerBlock(4, 128);
dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128)); dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));
kSamplingPatches <<<numBlocks, threadsPerBlock>>> kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
(images, target, imgSize, tgtSize, channels, samplingRate, target,
gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC, imgSize,
paddingValue, numImages); tgtSize,
channels,
samplingRate,
gpuRotationAngle,
gpuScaleRatio,
gpuCenterR,
gpuCenterC,
paddingValue,
numImages);
hl_device_synchronize(); hl_device_synchronize();
} }
void hl_conv_random_disturb(const real* images, int imgSize, void hl_conv_random_disturb(const real* images,
int tgtSize, int channels, int numImages, int imgSize,
real scaleRatio, real rotateAngle, int tgtSize,
int samplingRate, real* gpu_r_angle, int channels,
real* gpu_s_ratio, int* gpu_center_r, int numImages,
int* gpu_center_c, int paddingValue, real scaleRatio,
bool isTrain, real* targets) { real rotateAngle,
int samplingRate,
real* gpu_r_angle,
real* gpu_s_ratio,
int* gpu_center_r,
int* gpu_center_c,
int paddingValue,
bool isTrain,
real* targets) {
// generate the random disturbance sequence and the sampling locations // generate the random disturbance sequence and the sampling locations
hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r, hl_generate_disturb_params(gpu_r_angle,
gpu_center_c, numImages, imgSize, rotateAngle, gpu_s_ratio,
scaleRatio, samplingRate, isTrain); gpu_center_r,
gpu_center_c,
hl_conv_random_disturb_with_params( numImages,
images, imgSize, tgtSize, channels, numImages, imgSize,
samplingRate, gpu_r_angle, gpu_s_ratio, rotateAngle,
gpu_center_r, gpu_center_r, paddingValue, scaleRatio,
targets); samplingRate,
isTrain);
hl_conv_random_disturb_with_params(images,
imgSize,
tgtSize,
channels,
numImages,
samplingRate,
gpu_r_angle,
gpu_s_ratio,
gpu_center_r,
gpu_center_r,
paddingValue,
targets);
} }
...@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_base.h" #include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_cuda.h" #include "hl_cuda.h"
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
template<int blockDimX, int blockDimY, int gridDimX, bool AddRow> template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output, int ldo, __global__ void KeMatrixAddRows(real* output,
real* table, int ldt, int ldo,
real* table,
int ldt,
int* ids, int* ids,
int numSamples, int numSamples,
int tableSize, int tableSize,
...@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo, ...@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
while (idy < numSamples) { while (idy < numSamples) {
int tableId = ids[idy]; int tableId = ids[idy];
if ((0 <= tableId) && (tableId < tableSize)) { if ((0 <= tableId) && (tableId < tableSize)) {
real *out = output + idy * ldo; real* out = output + idy * ldo;
real *tab = table + tableId * ldt; real* tab = table + tableId * ldt;
for (int i = idx; i < dim; i += blockDimX) { for (int i = idx; i < dim; i += blockDimX) {
if (AddRow) { if (AddRow) {
paddle::paddleAtomicAdd(&tab[i], out[i]); paddle::paddleAtomicAdd(&tab[i], out[i]);
...@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo, ...@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
} }
} }
void hl_matrix_select_rows(real* output, int ldo, void hl_matrix_select_rows(real* output,
real* table, int ldt, int ldo,
real* table,
int ldt,
int* ids, int* ids,
int numSamples, int numSamples,
int tableSize, int tableSize,
...@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo, ...@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
dim3 threads(128, 8); dim3 threads(128, 8);
dim3 grid(8, 1); dim3 grid(8, 1);
KeMatrixAddRows<128, 8, 8, 0><<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
(output, ldo, table, ldt, ids, numSamples, tableSize, dim); output, ldo, table, ldt, ids, numSamples, tableSize, dim);
CHECK_SYNC("hl_matrix_select_rows failed"); CHECK_SYNC("hl_matrix_select_rows failed");
} }
void hl_matrix_add_to_rows(real* table, int ldt, void hl_matrix_add_to_rows(real* table,
real* input, int ldi, int ldt,
real* input,
int ldi,
int* ids, int* ids,
int numSamples, int numSamples,
int tableSize, int tableSize,
...@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt, ...@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
dim3 threads(128, 8); dim3 threads(128, 8);
dim3 grid(8, 1); dim3 grid(8, 1);
KeMatrixAddRows<128, 8, 8, 1><<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
(input, ldi, table, ldt, ids, numSamples, tableSize, dim); input, ldi, table, ldt, ids, numSamples, tableSize, dim);
CHECK_SYNC("hl_matrix_add_to_rows failed"); CHECK_SYNC("hl_matrix_add_to_rows failed");
} }
template<class T, int blockDimX, int gridDimX> template <class T, int blockDimX, int gridDimX>
__global__ void KeVectorSelect(T* dst, int sized, __global__ void KeVectorSelect(
const T* src, int sizes, T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
const int* ids, int sizei) {
int idx = threadIdx.x + blockDimX * blockIdx.x; int idx = threadIdx.x + blockDimX * blockIdx.x;
while (idx < sizei) { while (idx < sizei) {
int index = ids[idx]; int index = ids[idx];
...@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized, ...@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
} }
template <class T> template <class T>
void hl_vector_select_from(T* dst, int sized, void hl_vector_select_from(
const T* src, int sizes, T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
const int* ids, int sizei) {
CHECK_NOTNULL(dst); CHECK_NOTNULL(dst);
CHECK_NOTNULL(src); CHECK_NOTNULL(src);
CHECK_NOTNULL(ids); CHECK_NOTNULL(ids);
...@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized, ...@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
dim3 threads(512, 1); dim3 threads(512, 1);
dim3 grid(8, 1); dim3 grid(8, 1);
KeVectorSelect<T, 512, 8><<< grid, threads, 0, STREAM_DEFAULT >>> KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
(dst, sized, src, sizes, ids, sizei); dst, sized, src, sizes, ids, sizei);
CHECK_SYNC("hl_vector_select_from failed"); CHECK_SYNC("hl_vector_select_from failed");
} }
template template void hl_vector_select_from(real* dst,
void hl_vector_select_from(real* dst, int sized, int sized,
const real* src, int sizes, const real* src,
const int* ids, int sizei); int sizes,
template const int* ids,
void hl_vector_select_from(int* dst, int sized, int sizei);
const int* src, int sizes, template void hl_vector_select_from(
const int* ids, int sizei); int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);
...@@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_base.h" #include "hl_base.h"
#include "hl_top_k.h"
#include "hl_sparse.ph" #include "hl_sparse.ph"
#include "hl_top_k.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
// using namespace hppl; // using namespace hppl;
struct Pair { struct Pair {
__device__ __forceinline__ __device__ __forceinline__ Pair() {}
Pair() {}
__device__ __forceinline__ __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}
Pair(real value, int id) : v_(value), id_(id) {}
__device__ __forceinline__ __device__ __forceinline__ void set(real value, int id) {
void set(real value, int id) {
v_ = value; v_ = value;
id_ = id; id_ = id;
} }
__device__ __forceinline__ __device__ __forceinline__ void operator=(const Pair& in) {
void operator=(const Pair& in) {
v_ = in.v_; v_ = in.v_;
id_ = in.id_; id_ = in.id_;
} }
__device__ __forceinline__ __device__ __forceinline__ bool operator<(const real value) const {
bool operator<(const real value) const {
return (v_ < value); return (v_ < value);
} }
__device__ __forceinline__ __device__ __forceinline__ bool operator<(const Pair& in) const {
bool operator<(const Pair& in) const {
return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_)); return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
} }
__device__ __forceinline__ __device__ __forceinline__ bool operator>(const Pair& in) const {
bool operator>(const Pair& in) const {
return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_)); return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
} }
...@@ -58,8 +50,9 @@ struct Pair { ...@@ -58,8 +50,9 @@ struct Pair {
int id_; int id_;
}; };
__device__ __forceinline__ __device__ __forceinline__ void addTo(Pair topK[],
void addTo(Pair topK[], const Pair &p, int beamSize) { const Pair& p,
int beamSize) {
for (int k = beamSize - 2; k >= 0; k--) { for (int k = beamSize - 2; k >= 0; k--) {
if (topK[k] < p) { if (topK[k] < p) {
topK[k + 1] = topK[k]; topK[k + 1] = topK[k];
...@@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) { ...@@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) {
topK[0] = p; topK[0] = p;
} }
template<int beamSize> template <int beamSize>
__device__ __forceinline__ __device__ __forceinline__ void addTo(Pair topK[], const Pair& p) {
void addTo(Pair topK[], const Pair &p) {
for (int k = beamSize - 2; k >= 0; k--) { for (int k = beamSize - 2; k >= 0; k--) {
if (topK[k] < p) { if (topK[k] < p) {
topK[k + 1] = topK[k]; topK[k + 1] = topK[k];
...@@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) { ...@@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) {
topK[0] = p; topK[0] = p;
} }
template<int blockSize> template <int blockSize>
__device__ __forceinline__ __device__ __forceinline__ void getTopK(
void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) { Pair topK[], real* src, int idx, int dim, int beamSize) {
while (idx < dim) { while (idx < dim) {
if (topK[beamSize - 1] < src[idx]) { if (topK[beamSize - 1] < src[idx]) {
Pair tmp(src[idx], idx); Pair tmp(src[idx], idx);
...@@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) { ...@@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
} }
} }
template<int blockSize> template <int blockSize>
__device__ __forceinline__ __device__ __forceinline__ void getTopK(
void getTopK(Pair topK[], real *src, int idx, int dim, Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) {
const Pair& max, int beamSize) {
while (idx < dim) { while (idx < dim) {
if (topK[beamSize - 1] < src[idx]) { if (topK[beamSize - 1] < src[idx]) {
Pair tmp(src[idx], idx); Pair tmp(src[idx], idx);
...@@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, ...@@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim,
} }
} }
template<int blockSize> template <int blockSize>
__device__ __forceinline__ __device__ __forceinline__ void getTopK(
void getTopK(Pair topK[], real *val, int *col, Pair topK[], real* val, int* col, int idx, int dim, int beamSize) {
int idx, int dim, int beamSize) {
while (idx < dim) { while (idx < dim) {
if (topK[beamSize - 1] < val[idx]) { if (topK[beamSize - 1] < val[idx]) {
Pair tmp(val[idx], col[idx]); Pair tmp(val[idx], col[idx]);
...@@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col, ...@@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col,
} }
} }
template<int blockSize> template <int blockSize>
__device__ __forceinline__ __device__ __forceinline__ void getTopK(Pair topK[],
void getTopK(Pair topK[], real *val, int *col, int idx, int dim, real* val,
const Pair& max, int beamSize) { int* col,
int idx,
int dim,
const Pair& max,
int beamSize) {
while (idx < dim) { while (idx < dim) {
if (topK[beamSize - 1] < val[idx]) { if (topK[beamSize - 1] < val[idx]) {
Pair tmp(val[idx], col[idx]); Pair tmp(val[idx], col[idx]);
...@@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim, ...@@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
} }
} }
template<int maxLength, int blockSize> template <int maxLength, int blockSize>
__device__ __forceinline__ __device__ __forceinline__ void threadGetTopK(Pair topK[],
void threadGetTopK(Pair topK[], int& beam, int beamSize, int& beam,
real* src, int beamSize,
bool& firstStep, bool& isEmpty, Pair& max, real* src,
int dim, const int tid) { bool& firstStep,
bool& isEmpty,
Pair& max,
int dim,
const int tid) {
if (beam > 0) { if (beam > 0) {
int length = beam < beamSize ? beam : beamSize; int length = beam < beamSize ? beam : beamSize;
if (firstStep) { if (firstStep) {
...@@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, ...@@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
} }
} }
if (!isEmpty) { if (!isEmpty) {
getTopK<blockSize>(topK + maxLength - beam, src, tid, dim, getTopK<blockSize>(topK + maxLength - beam, src, tid, dim, max, length);
max, length);
} }
} }
...@@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, ...@@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
} }
} }
template<int maxLength, int blockSize> template <int maxLength, int blockSize>
__device__ __forceinline__ __device__ __forceinline__ void threadGetTopK(Pair topK[],
void threadGetTopK(Pair topK[], int& beam, int beamSize, int& beam,
real* val, int* col, int beamSize,
bool& firstStep, bool& isEmpty, Pair& max, real* val,
int dim, const int tid) { int* col,
bool& firstStep,
bool& isEmpty,
Pair& max,
int dim,
const int tid) {
if (beam > 0) { if (beam > 0) {
int length = beam < beamSize ? beam : beamSize; int length = beam < beamSize ? beam : beamSize;
if (firstStep) { if (firstStep) {
...@@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, ...@@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
} }
} }
if (!isEmpty) { if (!isEmpty) {
getTopK<blockSize>(topK + maxLength - beam, val, col, tid, dim, getTopK<blockSize>(
max, length); topK + maxLength - beam, val, col, tid, dim, max, length);
} }
} }
...@@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, ...@@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
} }
} }
template<int maxLength, int blockSize> template <int maxLength, int blockSize>
__device__ __forceinline__ __device__ __forceinline__ void blockReduce(Pair* shTopK,
void blockReduce(Pair* shTopK, int* maxId, Pair topK[], int* maxId,
real** topVal, int** topIds, Pair topK[],
int& beam, int& beamSize, real** topVal,
const int tid, const int warp) { int** topIds,
int& beam,
int& beamSize,
const int tid,
const int warp) {
while (true) { while (true) {
__syncthreads(); __syncthreads();
if (tid < blockSize / 2) { if (tid < blockSize / 2) {
...@@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[], ...@@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
} }
} }
__syncthreads(); __syncthreads();
for (int stride = blockSize / 4; stride > 0; stride = stride/2) { for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
if (tid < stride) { if (tid < stride) {
if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) { if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
maxId[tid] = maxId[tid + stride]; maxId[tid] = maxId[tid + stride];
...@@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[], ...@@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
* 3. go to the second setp, until one thread's topK value is null; * 3. go to the second setp, until one thread's topK value is null;
* 4. go to the first setp, until get the topK value. * 4. go to the first setp, until get the topK value.
*/ */
template<int maxLength, int blockSize> template <int maxLength, int blockSize>
__global__ void KeMatrixTopK(real* topVal, int ldv, __global__ void KeMatrixTopK(real* topVal,
int * topIds, int ldv,
real* src, int lds, int* topIds,
real* src,
int lds,
int dim, int dim,
int beamSize) { int beamSize) {
__shared__ Pair shTopK[blockSize]; __shared__ Pair shTopK[blockSize];
...@@ -271,7 +279,7 @@ __global__ void KeMatrixTopK(real* topVal, int ldv, ...@@ -271,7 +279,7 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
topVal += blockIdx.x * ldv; topVal += blockIdx.x * ldv;
topIds += blockIdx.x * beamSize; topIds += blockIdx.x * beamSize;
Pair topK[maxLength]; // NOLINT Pair topK[maxLength]; // NOLINT
int beam = maxLength; int beam = maxLength;
Pair max; Pair max;
bool isEmpty = false; bool isEmpty = false;
...@@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv, ...@@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
topK[k].set(-HL_FLOAT_MAX, -1); topK[k].set(-HL_FLOAT_MAX, -1);
} }
while (beamSize) { while (beamSize) {
threadGetTopK<maxLength, blockSize> threadGetTopK<maxLength, blockSize>(
(topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
shTopK[tid] = topK[0]; shTopK[tid] = topK[0];
blockReduce<maxLength, blockSize> blockReduce<maxLength, blockSize>(
(shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
} }
} }
template<int maxLength, int blockSize> template <int maxLength, int blockSize>
__global__ void KeSMatrixTopK(real* topVal, int ldv, __global__ void KeSMatrixTopK(real* topVal,
int * topIds, int ldv,
int* topIds,
real* val, real* val,
int* row, int* row,
int* col, int* col,
...@@ -304,7 +313,7 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv, ...@@ -304,7 +313,7 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
topVal += blockIdx.x * ldv; topVal += blockIdx.x * ldv;
topIds += blockIdx.x * beamSize; topIds += blockIdx.x * beamSize;
Pair topK[maxLength]; // NOLINT Pair topK[maxLength]; // NOLINT
int beam = maxLength; int beam = maxLength;
Pair max; Pair max;
bool isEmpty = false; bool isEmpty = false;
...@@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv, ...@@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
topK[k].set(-HL_FLOAT_MAX, -1); topK[k].set(-HL_FLOAT_MAX, -1);
} }
while (beamSize) { while (beamSize) {
threadGetTopK<maxLength, blockSize> threadGetTopK<maxLength, blockSize>(
(topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid); topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);
shTopK[tid] = topK[0]; shTopK[tid] = topK[0];
blockReduce<maxLength, blockSize> blockReduce<maxLength, blockSize>(
(shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
} }
} }
void hl_matrix_top_k(real* topVal, int ldv, void hl_matrix_top_k(real* topVal,
int * topIds, int ldv,
real* src, int lds, int* topIds,
real* src,
int lds,
int dim, int dim,
int beamSize, int beamSize,
int numSamples) { int numSamples) {
...@@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv, ...@@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv,
dim3 threads(256, 1); dim3 threads(256, 1);
dim3 grid(numSamples, 1); dim3 grid(numSamples, 1);
KeMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
(topVal, ldv, topIds, src, lds, dim, beamSize); topVal, ldv, topIds, src, lds, dim, beamSize);
CHECK_SYNC("hl_matrix_top_k failed"); CHECK_SYNC("hl_matrix_top_k failed");
} }
void hl_sparse_matrix_top_k(real* topVal, int ldv, void hl_sparse_matrix_top_k(real* topVal,
int * topIds, int ldv,
int* topIds,
hl_sparse_matrix_s src, hl_sparse_matrix_s src,
int beamSize, int beamSize,
int numSamples) { int numSamples) {
CHECK_NOTNULL(topVal); CHECK_NOTNULL(topVal);
CHECK_NOTNULL(topIds); CHECK_NOTNULL(topIds);
CHECK_NOTNULL(src); CHECK_NOTNULL(src);
CHECK_EQ(src->format, HL_SPARSE_CSR) CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";
<<"sparse matrix format error!";
hl_csr_matrix csr = (hl_csr_matrix)src->matrix; hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
if (csr->csr_val == NULL || csr->csr_row == NULL || if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
csr->csr_col == NULL) {
LOG(FATAL) << "parameter src is null!"; LOG(FATAL) << "parameter src is null!";
} }
dim3 threads(256, 1); dim3 threads(256, 1);
dim3 grid(numSamples, 1); dim3 grid(numSamples, 1);
KeSMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>> KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
(topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize); topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);
CHECK_SYNC("hl_sparse_matrix_top_k failed"); CHECK_SYNC("hl_sparse_matrix_top_k failed");
} }
...@@ -392,10 +402,12 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv, ...@@ -392,10 +402,12 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
* 3. go to the second setp, until one thread's topK value is null; * 3. go to the second setp, until one thread's topK value is null;
* 4. go to the first setp, until get the topK value. * 4. go to the first setp, until get the topK value.
*/ */
template<int maxLength, int blockSize> template <int maxLength, int blockSize>
__global__ void KeMatrixTopKClassificationError(real* topVal, int ldv, __global__ void KeMatrixTopKClassificationError(real* topVal,
int * topIds, int ldv,
real* src, int lds, int* topIds,
real* src,
int lds,
int dim, int dim,
int beamSize, int beamSize,
int* label, int* label,
...@@ -408,7 +420,7 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv, ...@@ -408,7 +420,7 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
topVal += blockIdx.x * ldv; topVal += blockIdx.x * ldv;
topIds += blockIdx.x * beamSize; topIds += blockIdx.x * beamSize;
Pair topK[maxLength]; // NOLINT Pair topK[maxLength]; // NOLINT
int beam = maxLength; int beam = maxLength;
Pair max; Pair max;
bool isEmpty = false; bool isEmpty = false;
...@@ -420,34 +432,36 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv, ...@@ -420,34 +432,36 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
} }
while (beamSize) { while (beamSize) {
threadGetTopK<maxLength, blockSize> threadGetTopK<maxLength, blockSize>(
(topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
shTopK[tid] = topK[0]; shTopK[tid] = topK[0];
blockReduce<maxLength, blockSize> blockReduce<maxLength, blockSize>(
(shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
} }
__syncthreads(); __syncthreads();
if (tid == 0) { if (tid == 0) {
for (int i = 0; i < topkSize; i++) { for (int i = 0; i < topkSize; i++) {
if (*--topIds == label[blockIdx.x]) { if (*--topIds == label[blockIdx.x]) {
recResult[blockIdx.x] = 0; recResult[blockIdx.x] = 0;
break; break;
} }
recResult[blockIdx.x] = 1.0f; recResult[blockIdx.x] = 1.0f;
} }
} }
} }
void hl_matrix_classification_error(real* topVal, int ldv, void hl_matrix_classification_error(real* topVal,
int* topIds, int ldv,
real* src, int lds, int* topIds,
int dim, real* src,
int topkSize, int lds,
int numSamples, int dim,
int* label, int topkSize,
real* recResult) { int numSamples,
int* label,
real* recResult) {
CHECK_NOTNULL(topVal); CHECK_NOTNULL(topVal);
CHECK_NOTNULL(topIds); CHECK_NOTNULL(topIds);
CHECK_NOTNULL(src); CHECK_NOTNULL(src);
...@@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv, ...@@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv,
dim3 threads(256, 1); dim3 threads(256, 1);
dim3 grid(numSamples, 1); dim3 grid(numSamples, 1);
KeMatrixTopKClassificationError<5, 256> KeMatrixTopKClassificationError<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<< grid, threads, 0, STREAM_DEFAULT >>> topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
(topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
CHECK_SYNC("hl_matrix_top_k classification error failed"); CHECK_SYNC("hl_matrix_top_k classification error failed");
} }
...@@ -12,13 +12,15 @@ cc_test(variable_test SRCS variable_test.cc) ...@@ -12,13 +12,15 @@ cc_test(variable_test SRCS variable_test.cc)
cc_library(scope SRCS scope.cc) cc_library(scope SRCS scope.cc)
cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_test(scope_test SRCS scope_test.cc DEPS scope)
proto_library(attr_type SRCS attr_type.proto) proto_library(attribute_proto SRCS attribute.proto)
proto_library(op_proto SRCS op_proto.proto DEPS attr_type) proto_library(op_proto SRCS op_proto.proto DEPS attribute_proto)
proto_library(op_desc SRCS op_desc.proto DEPS attr_type) proto_library(op_desc SRCS op_desc.proto DEPS attribute_proto)
cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope) cc_library(attribute SRCS attribute.cc DEPS op_desc op_proto)
cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope attribute)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator) cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator)
...@@ -26,7 +28,7 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_builder) ...@@ -26,7 +28,7 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_builder)
cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op) cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)
py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) py_proto_compile(framework_py_proto SRCS attribute.proto op_proto.proto op_desc.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module. # Generate an empty __init__.py to make framework_py_proto as a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init) add_dependencies(framework_py_proto framework_py_proto_init)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/attribute.h"
#include <vector>
namespace paddle {
namespace framework {
template <>
AttrType AttrTypeID<int>() {
return INT;
}
template <>
AttrType AttrTypeID<float>() {
return FLOAT;
}
template <>
AttrType AttrTypeID<std::string>() {
return STRING;
}
template <>
AttrType AttrTypeID<std::vector<int>>() {
return INTS;
}
template <>
AttrType AttrTypeID<std::vector<float>>() {
return FLOATS;
}
template <>
AttrType AttrTypeID<std::vector<std::string>>() {
return STRINGS;
}
Attribute GetAttrValue(const AttrDesc& attr_desc) {
switch (attr_desc.type()) {
case paddle::framework::AttrType::INT: {
return attr_desc.i();
}
case paddle::framework::AttrType::FLOAT: {
return attr_desc.f();
}
case paddle::framework::AttrType::STRING: {
return attr_desc.s();
}
case paddle::framework::AttrType::INTS: {
std::vector<int> val(attr_desc.ints_size());
for (int i = 0; i < attr_desc.ints_size(); ++i) {
val[i] = attr_desc.ints(i);
}
return val;
}
case paddle::framework::AttrType::FLOATS: {
std::vector<float> val(attr_desc.floats_size());
for (int i = 0; i < attr_desc.floats_size(); ++i) {
val[i] = attr_desc.floats(i);
}
return val;
}
case paddle::framework::AttrType::STRINGS: {
std::vector<std::string> val(attr_desc.strings_size());
for (int i = 0; i < attr_desc.strings_size(); ++i) {
val[i] = attr_desc.strings(i);
}
return val;
}
}
PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");
return boost::blank();
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once #pragma once
#include <boost/variant.hpp> #include <boost/variant.hpp>
...@@ -6,6 +20,9 @@ ...@@ -6,6 +20,9 @@
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/framework/attribute.pb.h"
#include "paddle/framework/op_desc.pb.h"
#include "paddle/platform/enforce.h" #include "paddle/platform/enforce.h"
namespace paddle { namespace paddle {
...@@ -14,13 +31,19 @@ namespace framework { ...@@ -14,13 +31,19 @@ namespace framework {
typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>, typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
std::vector<float>, std::vector<std::string>> std::vector<float>, std::vector<std::string>>
Attribute; Attribute;
typedef std::unordered_map<std::string, Attribute> AttributeMap; typedef std::unordered_map<std::string, Attribute> AttributeMap;
template <typename T>
AttrType AttrTypeID();
Attribute GetAttrValue(const AttrDesc& attr_desc);
// check whether a value(attribute) fit a certain limit // check whether a value(attribute) fit a certain limit
template <typename T> template <typename T>
class LargerThanChecker { class LargerThanChecker {
public: public:
LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {} explicit LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const { void operator()(T& value) const {
PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail"); PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail");
} }
...@@ -35,7 +58,8 @@ class LargerThanChecker { ...@@ -35,7 +58,8 @@ class LargerThanChecker {
template <typename T> template <typename T>
class DefaultValueSetter { class DefaultValueSetter {
public: public:
DefaultValueSetter(T default_value) : default_value_(default_value) {} explicit DefaultValueSetter(T default_value)
: default_value_(default_value) {}
void operator()(T& value) const { value = default_value_; } void operator()(T& value) const { value = default_value_; }
private: private:
...@@ -78,7 +102,8 @@ class TypedAttrChecker { ...@@ -78,7 +102,8 @@ class TypedAttrChecker {
typedef std::function<void(T&)> ValueChecker; typedef std::function<void(T&)> ValueChecker;
public: public:
TypedAttrChecker(const std::string& attr_name) : attr_name_(attr_name) {} explicit TypedAttrChecker(const std::string& attr_name)
: attr_name_(attr_name) {}
TypedAttrChecker& InEnum(const std::unordered_set<T>& range) { TypedAttrChecker& InEnum(const std::unordered_set<T>& range) {
value_checkers_.push_back(EnumInContainer<T>(range)); value_checkers_.push_back(EnumInContainer<T>(range));
......
...@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
syntax="proto2"; syntax = "proto2";
package paddle.framework; package paddle.framework;
// Attribute Type for paddle's Op. // Attribute Type for paddle's Op.
// Op contains many attributes. Each type of attributes could be different. // Op contains many attributes. Each type of attributes could be different.
// The AttrType will be shared between AttrDesc and AttrProto. // The AttrType will be shared between AttrDesc and AttrProto.
enum AttrType { enum AttrType {
INT = 0; INT = 0;
FLOAT = 1; FLOAT = 1;
STRING = 2; STRING = 2;
INTS = 3; INTS = 3;
FLOATS = 4; FLOATS = 4;
STRINGS = 5; STRINGS = 5;
} }
\ No newline at end of file
...@@ -59,19 +59,17 @@ std::shared_ptr<OperatorBase> BackwardRecursive( ...@@ -59,19 +59,17 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
// If all input gradients of forwarding operator do not need to calculate, // If all input gradients of forwarding operator do not need to calculate,
// just return an NOP. Not return null ptr because NOP does not take // just return an NOP. Not return null ptr because NOP does not take
// too much time for calculation, but it is useful for simplifying logic. // too much time for calculation, but it is useful for simplifying logic.
if (AllInSet(forwardOp.inputs_, OperatorBase::GRAD_VAR_SUFFIX(), if (AllInSet(forwardOp.inputs_, kGradVarSuffix, no_grad_names)) {
no_grad_names)) {
return NOP(); return NOP();
} }
// All output gradients of forwarding operator do not need to calculate. // All output gradients of forwarding operator do not need to calculate.
// Then all input gradients cannot be computed at all, and we put them into // Then all input gradients cannot be computed at all, and we put them into
// `no_grad_names` set. Return an NOP. // `no_grad_names` set. Return an NOP.
if (AllInSet(forwardOp.outputs_, OperatorBase::GRAD_VAR_SUFFIX(), if (AllInSet(forwardOp.outputs_, kGradVarSuffix, no_grad_names)) {
no_grad_names)) {
for (auto& name : forwardOp.inputs_) { for (auto& name : forwardOp.inputs_) {
// Mark all input is not need // Mark all input is not need
no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX()); no_grad_names.insert(name + kGradVarSuffix);
} }
return NOP(); return NOP();
} }
...@@ -134,9 +132,9 @@ std::shared_ptr<OperatorBase> BackwardRecursive( ...@@ -134,9 +132,9 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
std::shared_ptr<OperatorBase> grad_op = OpRegistry::CreateGradOp(forwardOp); std::shared_ptr<OperatorBase> grad_op = OpRegistry::CreateGradOp(forwardOp);
for (std::string& grad_input : grad_op->inputs_) { for (std::string& grad_input : grad_op->inputs_) {
if (no_grad_names.count(grad_input)) { if (no_grad_names.count(grad_input)) {
std::string prefix = grad_input.substr( std::string prefix =
0, grad_input.size() - OperatorBase::GRAD_VAR_SUFFIX().size()); grad_input.substr(0, grad_input.size() - kGradVarSuffix.size());
grad_input = prefix + OperatorBase::ZERO_VAR_SUFFIX(); grad_input = prefix + kZeroVarSuffix;
// If part of input gradient of that operator is not calculated, fill // If part of input gradient of that operator is not calculated, fill
// zero variables to that input gradient. // zero variables to that input gradient.
...@@ -147,7 +145,7 @@ std::shared_ptr<OperatorBase> BackwardRecursive( ...@@ -147,7 +145,7 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
for (std::string& grad_output : grad_op->outputs_) { for (std::string& grad_output : grad_op->outputs_) {
if (no_grad_names.count(grad_output)) { if (no_grad_names.count(grad_output)) {
grad_output = OperatorBase::EMPTY_VAR_NAME(); grad_output = kEmptyVarName;
} }
} }
...@@ -168,14 +166,14 @@ std::shared_ptr<OperatorBase> Backward( ...@@ -168,14 +166,14 @@ std::shared_ptr<OperatorBase> Backward(
std::unordered_set<std::string> no_grad_names; std::unordered_set<std::string> no_grad_names;
no_grad_names.reserve(no_grad_vars.size()); no_grad_names.reserve(no_grad_vars.size());
no_grad_names.insert(OperatorBase::EMPTY_VAR_NAME() + no_grad_names.insert(kEmptyVarName + kGradVarSuffix);
OperatorBase::GRAD_VAR_SUFFIX());
for (auto& name : no_grad_vars) { for (auto& name : no_grad_vars) {
no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX()); no_grad_names.insert(name + kGradVarSuffix);
} }
size_t uid = 0; size_t uid = 0;
return BackwardRecursive(forwardOp, no_grad_names, uid); return BackwardRecursive(forwardOp, no_grad_names, uid);
} }
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -78,14 +78,14 @@ class FcOp : public ops::NetOp { ...@@ -78,14 +78,14 @@ class FcOp : public ops::NetOp {
{Output("mul_result")}, {})); {Output("mul_result")}, {}));
auto b_name = Input("b"); auto b_name = Input("b");
std::string before_act = "mul_result"; std::string before_act = "mul_result";
if (b_name != EMPTY_VAR_NAME()) { if (b_name != kEmptyVarName) {
AddOp(OpRegistry::CreateOp("rowwise_add", {Output("mul_result"), b_name}, AddOp(OpRegistry::CreateOp("rowwise_add", {Output("mul_result"), b_name},
{Output("add_result")}, {})); {Output("add_result")}, {}));
before_act = "add_result"; before_act = "add_result";
} else { } else {
auto out_varname = Output("add_result"); auto out_varname = Output("add_result");
if (out_varname != EMPTY_VAR_NAME()) { if (out_varname != kEmptyVarName) {
this->Rename(out_varname, EMPTY_VAR_NAME()); this->Rename(out_varname, kEmptyVarName);
} }
} }
...@@ -163,13 +163,12 @@ TEST(Backward, simple_op_grad) { ...@@ -163,13 +163,12 @@ TEST(Backward, simple_op_grad) {
ASSERT_NE(fwd, nullptr); ASSERT_NE(fwd, nullptr);
auto gop = f::OpRegistry::CreateGradOp(*fwd); auto gop = f::OpRegistry::CreateGradOp(*fwd);
ASSERT_EQ(4UL, gop->inputs_.size()); ASSERT_EQ(4UL, gop->inputs_.size());
ASSERT_EQ(f::OperatorBase::EMPTY_VAR_NAME(), gop->inputs_[0]); ASSERT_EQ(f::kEmptyVarName, gop->inputs_[0]);
ASSERT_EQ("rowwise_add_grad", gop->type_); ASSERT_EQ("rowwise_add_grad", gop->type_);
ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->outputs_[0]); ASSERT_EQ("X" + f::kGradVarSuffix, gop->outputs_[0]);
ASSERT_EQ("b" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->outputs_[1]); ASSERT_EQ("b" + f::kGradVarSuffix, gop->outputs_[1]);
ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(), ASSERT_EQ("X" + f::kGradVarSuffix, gop->Output("X" + f::kGradVarSuffix));
gop->Output("X" + f::OperatorBase::GRAD_VAR_SUFFIX()));
} }
TEST(Backward, simple_op_not_need_grad) { TEST(Backward, simple_op_not_need_grad) {
...@@ -177,7 +176,7 @@ TEST(Backward, simple_op_not_need_grad) { ...@@ -177,7 +176,7 @@ TEST(Backward, simple_op_not_need_grad) {
ASSERT_NE(fwd, nullptr); ASSERT_NE(fwd, nullptr);
auto gop = f::Backward(*fwd, {"X"}); auto gop = f::Backward(*fwd, {"X"});
ASSERT_EQ(std::find(gop->outputs_.begin(), gop->outputs_.end(), ASSERT_EQ(std::find(gop->outputs_.begin(), gop->outputs_.end(),
"X" + f::OperatorBase::GRAD_VAR_SUFFIX()), "X" + f::kGradVarSuffix),
gop->outputs_.end()); gop->outputs_.end());
auto no_input_gop = f::Backward(*fwd, {"X", "b"}); auto no_input_gop = f::Backward(*fwd, {"X", "b"});
...@@ -210,9 +209,9 @@ TEST(Backward, net_fc_backward_normal) { ...@@ -210,9 +209,9 @@ TEST(Backward, net_fc_backward_normal) {
} }
TEST(Backward, net_fc_backward_not_have_b) { TEST(Backward, net_fc_backward_not_have_b) {
std::shared_ptr<f::OperatorBase> fwd = f::OpRegistry::CreateOp( std::shared_ptr<f::OperatorBase> fwd =
"fc", {"X", "w", f::OperatorBase::EMPTY_VAR_NAME()}, f::OpRegistry::CreateOp("fc", {"X", "w", f::kEmptyVarName},
{"mul_result", "add_result", "tmp"}, {}); {"mul_result", "add_result", "tmp"}, {});
ASSERT_NE(fwd, nullptr); ASSERT_NE(fwd, nullptr);
std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {}); std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
ASSERT_TRUE(gop->IsNetOp()); ASSERT_TRUE(gop->IsNetOp());
...@@ -242,24 +241,21 @@ TEST(Backward, net_input_of_network_not_need_grad) { ...@@ -242,24 +241,21 @@ TEST(Backward, net_input_of_network_not_need_grad) {
std::unordered_set<std::string> all_output = std::unordered_set<std::string>( std::unordered_set<std::string> all_output = std::unordered_set<std::string>(
bwd_net->outputs_.begin(), bwd_net->outputs_.end()); bwd_net->outputs_.begin(), bwd_net->outputs_.end());
all_output.erase(f::OperatorBase::EMPTY_VAR_NAME()); all_output.erase(f::kEmptyVarName);
for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) { for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) {
ASSERT_NE(all_output.find(out + f::OperatorBase::GRAD_VAR_SUFFIX()), ASSERT_NE(all_output.find(out + f::kGradVarSuffix), all_output.end());
all_output.end());
} }
// Not Generated X // Not Generated X
ASSERT_EQ(all_output.find("X" + f::OperatorBase::GRAD_VAR_SUFFIX()), ASSERT_EQ(all_output.find("X" + f::kGradVarSuffix), all_output.end());
all_output.end());
ASSERT_EQ(2UL, bwd_net->ops_.size()); ASSERT_EQ(2UL, bwd_net->ops_.size());
ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp()); ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
auto first_fc_grad = static_cast<ops::NetOp *>(bwd_net->ops_[1].get()); auto first_fc_grad = static_cast<ops::NetOp *>(bwd_net->ops_[1].get());
ASSERT_EQ(3UL, first_fc_grad->ops_.size()); ASSERT_EQ(3UL, first_fc_grad->ops_.size());
ASSERT_EQ( ASSERT_EQ(f::kEmptyVarName,
f::OperatorBase::EMPTY_VAR_NAME(), first_fc_grad->ops_[2]->Output("A" + f::kGradVarSuffix));
first_fc_grad->ops_[2]->Output("A" + f::OperatorBase::GRAD_VAR_SUFFIX()));
} }
TEST(Backward, net_shared_weight) { TEST(Backward, net_shared_weight) {
...@@ -311,17 +307,15 @@ TEST(Backward, op_part_of_output_are_not_need) { ...@@ -311,17 +307,15 @@ TEST(Backward, op_part_of_output_are_not_need) {
ASSERT_EQ(1UL, fill_zero.inputs_.size()); ASSERT_EQ(1UL, fill_zero.inputs_.size());
ASSERT_EQ("Z", fill_zero.inputs_[0]); ASSERT_EQ("Z", fill_zero.inputs_[0]);
ASSERT_EQ(1UL, fill_zero.outputs_.size()); ASSERT_EQ(1UL, fill_zero.outputs_.size());
ASSERT_EQ("Z" + f::OperatorBase::ZERO_VAR_SUFFIX(), fill_zero.outputs_[0]); ASSERT_EQ("Z" + f::kZeroVarSuffix, fill_zero.outputs_[0]);
auto &d_many_out = *net->ops_[1]; auto &d_many_out = *net->ops_[1];
ASSERT_EQ("many_output_op_grad", d_many_out.type_); ASSERT_EQ("many_output_op_grad", d_many_out.type_);
ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.inputs_.size()); // I/O/OG ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.inputs_.size()); // I/O/OG
ASSERT_EQ("Z" + f::OperatorBase::ZERO_VAR_SUFFIX(), ASSERT_EQ("Z" + f::kZeroVarSuffix, d_many_out.Input("z" + f::kGradVarSuffix));
d_many_out.Input("z" + f::OperatorBase::GRAD_VAR_SUFFIX())); ASSERT_EQ("Y" + f::kGradVarSuffix, d_many_out.Input("y" + f::kGradVarSuffix));
ASSERT_EQ("Y" + f::OperatorBase::GRAD_VAR_SUFFIX(), ASSERT_EQ("X" + f::kGradVarSuffix,
d_many_out.Input("y" + f::OperatorBase::GRAD_VAR_SUFFIX())); d_many_out.Output("x" + f::kGradVarSuffix));
ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(),
d_many_out.Output("x" + f::OperatorBase::GRAD_VAR_SUFFIX()));
} }
TEST(Backward, op_part_of_input_are_not_need) { TEST(Backward, op_part_of_input_are_not_need) {
...@@ -331,12 +325,10 @@ TEST(Backward, op_part_of_input_are_not_need) { ...@@ -331,12 +325,10 @@ TEST(Backward, op_part_of_input_are_not_need) {
ASSERT_EQ(grad_mul.type_, "mul_grad"); ASSERT_EQ(grad_mul.type_, "mul_grad");
ASSERT_EQ(grad_mul.inputs_.size(), 2UL + 1UL + 1UL); ASSERT_EQ(grad_mul.inputs_.size(), 2UL + 1UL + 1UL);
ASSERT_EQ(grad_mul.outputs_.size(), 2UL); ASSERT_EQ(grad_mul.outputs_.size(), 2UL);
ASSERT_EQ(grad_mul.Output("A" + f::OperatorBase::GRAD_VAR_SUFFIX()), ASSERT_EQ(grad_mul.Output("A" + f::kGradVarSuffix), f::kEmptyVarName);
f::OperatorBase::EMPTY_VAR_NAME()); ASSERT_EQ(grad_mul.Output("B" + f::kGradVarSuffix), "b" + f::kGradVarSuffix);
ASSERT_EQ(grad_mul.Output("B" + f::OperatorBase::GRAD_VAR_SUFFIX()), ASSERT_EQ(grad_mul.Input("Out" + f::kGradVarSuffix),
"b" + f::OperatorBase::GRAD_VAR_SUFFIX()); "out" + f::kGradVarSuffix);
ASSERT_EQ(grad_mul.Input("Out" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"out" + f::OperatorBase::GRAD_VAR_SUFFIX());
ASSERT_EQ(grad_mul.Input("A"), "a"); ASSERT_EQ(grad_mul.Input("A"), "a");
ASSERT_EQ(grad_mul.Input("B"), "b"); ASSERT_EQ(grad_mul.Input("B"), "b");
ASSERT_EQ(grad_mul.Input("Out"), "out"); ASSERT_EQ(grad_mul.Input("Out"), "out");
...@@ -368,23 +360,4 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) { ...@@ -368,23 +360,4 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
EXPECT_EQ(bwd_net->ops_[1]->outputs_.size(), 0UL); EXPECT_EQ(bwd_net->ops_[1]->outputs_.size(), 0UL);
EXPECT_EQ(bwd_net->ops_[2]->inputs_.size(), 0UL); EXPECT_EQ(bwd_net->ops_[2]->inputs_.size(), 0UL);
EXPECT_EQ(bwd_net->ops_[2]->outputs_.size(), 0UL); EXPECT_EQ(bwd_net->ops_[2]->outputs_.size(), 0UL);
/*
EXPECT_EQ(grad_fc.Output("X" + f::OperatorBase::GRAD_VAR_SUFFIX()),
f::OperatorBase::EMPTY_VAR_NAME());
EXPECT_EQ(grad_fc.Output("W" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"w3" + f::OperatorBase::GRAD_VAR_SUFFIX());
EXPECT_EQ(grad_fc.Output("b" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"b3" + f::OperatorBase::GRAD_VAR_SUFFIX());
EXPECT_EQ(grad_fc.Output("mul_result" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"mul_out3" + f::OperatorBase::GRAD_VAR_SUFFIX());
EXPECT_EQ(grad_fc.Input("Out" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"out3" + f::OperatorBase::GRAD_VAR_SUFFIX());
EXPECT_EQ(grad_fc.Input("X"), "out2");
EXPECT_EQ(grad_fc.Input("W"), "w3");
EXPECT_EQ(grad_fc.Input("mul_result"), "mul_out3");
EXPECT_EQ(grad_fc.Input("add_result"), "tmp_out3");
EXPECT_EQ(grad_fc.Input("Out"), "out3");
*/
} }
...@@ -56,8 +56,7 @@ static void TransOpArg(const OperatorBase* src_op, OperatorBase* dst_op, ...@@ -56,8 +56,7 @@ static void TransOpArg(const OperatorBase* src_op, OperatorBase* dst_op,
for (const auto& arg : src_arg_list) { for (const auto& arg : src_arg_list) {
std::string src_name = arg.name(); std::string src_name = arg.name();
std::string dst_name = std::string dst_name = is_grad ? src_name + kGradVarSuffix : src_name;
is_grad ? src_name + OperatorBase::GRAD_VAR_SUFFIX() : src_name;
(*dst_op->in_out_idxs_)[dst_name] = idx++; (*dst_op->in_out_idxs_)[dst_name] = idx++;
int src_arg_idx = src_op->in_out_idxs_->at(src_name); int src_arg_idx = src_op->in_out_idxs_->at(src_name);
int src_begin = int src_begin =
...@@ -65,10 +64,9 @@ static void TransOpArg(const OperatorBase* src_op, OperatorBase* dst_op, ...@@ -65,10 +64,9 @@ static void TransOpArg(const OperatorBase* src_op, OperatorBase* dst_op,
int src_end = src_format == nullptr ? src_arg_idx + 1 int src_end = src_format == nullptr ? src_arg_idx + 1
: src_format->at(src_arg_idx + 1); : src_format->at(src_arg_idx + 1);
for (int i = src_begin; i < src_end; ++i) { for (int i = src_begin; i < src_end; ++i) {
std::string s = is_grad ? src_inout[i] + OperatorBase::GRAD_VAR_SUFFIX() std::string s =
: arg.ignore_gradient() is_grad ? src_inout[i] + kGradVarSuffix
? OperatorBase::EMPTY_VAR_NAME() : (arg.ignore_gradient() ? kEmptyVarName : src_inout[i]);
: src_inout[i];
dst_inout.emplace_back(s); dst_inout.emplace_back(s);
} }
if (dst_format != nullptr) { if (dst_format != nullptr) {
......
...@@ -83,24 +83,21 @@ TEST(GradOpBuilder, MutiInOut) { ...@@ -83,24 +83,21 @@ TEST(GradOpBuilder, MutiInOut) {
EXPECT_EQ(grad_test_op->Input("Out1"), "out1"); EXPECT_EQ(grad_test_op->Input("Out1"), "out1");
EXPECT_EQ(grad_test_op->Inputs("Out2_mult"), EXPECT_EQ(grad_test_op->Inputs("Out2_mult"),
std::vector<std::string>({"out2_1", "out2_2"})); std::vector<std::string>({"out2_1", "out2_2"}));
EXPECT_EQ(grad_test_op->Input("Out1" + f::OperatorBase::GRAD_VAR_SUFFIX()), EXPECT_EQ(grad_test_op->Input("Out1" + f::kGradVarSuffix),
"out1" + f::OperatorBase::GRAD_VAR_SUFFIX()); "out1" + f::kGradVarSuffix);
EXPECT_EQ( EXPECT_EQ(grad_test_op->Inputs("Out2_mult" + f::kGradVarSuffix),
grad_test_op->Inputs("Out2_mult" + f::OperatorBase::GRAD_VAR_SUFFIX()), std::vector<std::string>(
std::vector<std::string>( {"out2_1" + f::kGradVarSuffix, "out2_2" + f::kGradVarSuffix}));
{"out2_1" + f::OperatorBase::GRAD_VAR_SUFFIX(),
"out2_2" + f::OperatorBase::GRAD_VAR_SUFFIX()}));
ASSERT_EQ(grad_test_op->outputs_.size(), 5UL); ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
EXPECT_EQ(grad_test_op->Output("In1" + f::OperatorBase::GRAD_VAR_SUFFIX()), EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix),
"in1" + f::OperatorBase::GRAD_VAR_SUFFIX()); "in1" + f::kGradVarSuffix);
EXPECT_EQ( EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix),
grad_test_op->Outputs("In2_mult" + f::OperatorBase::GRAD_VAR_SUFFIX()), std::vector<std::string>({"in2_1" + f::kGradVarSuffix,
std::vector<std::string>({"in2_1" + f::OperatorBase::GRAD_VAR_SUFFIX(), "in2_2" + f::kGradVarSuffix,
"in2_2" + f::OperatorBase::GRAD_VAR_SUFFIX(), "in2_3" + f::kGradVarSuffix}));
"in2_3" + f::OperatorBase::GRAD_VAR_SUFFIX()})); EXPECT_EQ(grad_test_op->Output("In3" + f::kGradVarSuffix),
EXPECT_EQ(grad_test_op->Output("In3" + f::OperatorBase::GRAD_VAR_SUFFIX()), "in3" + f::kGradVarSuffix);
"in3" + f::OperatorBase::GRAD_VAR_SUFFIX());
} }
TEST(GradOpBuilder, IOIgnoredInGradient) { TEST(GradOpBuilder, IOIgnoredInGradient) {
...@@ -116,30 +113,25 @@ TEST(GradOpBuilder, IOIgnoredInGradient) { ...@@ -116,30 +113,25 @@ TEST(GradOpBuilder, IOIgnoredInGradient) {
ASSERT_EQ(grad_test_op->inputs_.size(), 5UL + 3UL + 3UL); ASSERT_EQ(grad_test_op->inputs_.size(), 5UL + 3UL + 3UL);
EXPECT_EQ(grad_test_op->Input("In1"), "in1"); EXPECT_EQ(grad_test_op->Input("In1"), "in1");
EXPECT_EQ(grad_test_op->Inputs("In2_mult"), EXPECT_EQ(grad_test_op->Inputs("In2_mult"),
std::vector<std::string>({f::OperatorBase::EMPTY_VAR_NAME(), std::vector<std::string>({f::kEmptyVarName, f::kEmptyVarName}));
f::OperatorBase::EMPTY_VAR_NAME()}));
EXPECT_EQ(grad_test_op->Inputs("In3_mult"), EXPECT_EQ(grad_test_op->Inputs("In3_mult"),
std::vector<std::string>({"in3_1", "in3_2"})); std::vector<std::string>({"in3_1", "in3_2"}));
EXPECT_EQ(grad_test_op->Inputs("Out1_mult"), EXPECT_EQ(grad_test_op->Inputs("Out1_mult"),
std::vector<std::string>({"out1_1", "out1_2"})); std::vector<std::string>({"out1_1", "out1_2"}));
EXPECT_EQ(grad_test_op->Input("Out2"), f::OperatorBase::EMPTY_VAR_NAME()); EXPECT_EQ(grad_test_op->Input("Out2"), f::kEmptyVarName);
EXPECT_EQ( EXPECT_EQ(grad_test_op->Inputs("Out1_mult" + f::kGradVarSuffix),
grad_test_op->Inputs("Out1_mult" + f::OperatorBase::GRAD_VAR_SUFFIX()), std::vector<std::string>(
std::vector<std::string>( {"out1_1" + f::kGradVarSuffix, "out1_2" + f::kGradVarSuffix}));
{"out1_1" + f::OperatorBase::GRAD_VAR_SUFFIX(), EXPECT_EQ(grad_test_op->Input("Out2" + f::kGradVarSuffix),
"out1_2" + f::OperatorBase::GRAD_VAR_SUFFIX()})); "out2" + f::kGradVarSuffix);
EXPECT_EQ(grad_test_op->Input("Out2" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"out2" + f::OperatorBase::GRAD_VAR_SUFFIX());
ASSERT_EQ(grad_test_op->outputs_.size(), 5UL); ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
EXPECT_EQ(grad_test_op->Output("In1" + f::OperatorBase::GRAD_VAR_SUFFIX()), EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix),
"in1" + f::OperatorBase::GRAD_VAR_SUFFIX()); "in1" + f::kGradVarSuffix);
EXPECT_EQ( EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix),
grad_test_op->Outputs("In2_mult" + f::OperatorBase::GRAD_VAR_SUFFIX()), std::vector<std::string>(
std::vector<std::string>({"in2_1" + f::OperatorBase::GRAD_VAR_SUFFIX(), {"in2_1" + f::kGradVarSuffix, "in2_2" + f::kGradVarSuffix}));
"in2_2" + f::OperatorBase::GRAD_VAR_SUFFIX()})); EXPECT_EQ(grad_test_op->Outputs("In3_mult" + f::kGradVarSuffix),
EXPECT_EQ( std::vector<std::string>(
grad_test_op->Outputs("In3_mult" + f::OperatorBase::GRAD_VAR_SUFFIX()), {"in3_1" + f::kGradVarSuffix, "in3_2" + f::kGradVarSuffix}));
std::vector<std::string>({"in3_1" + f::OperatorBase::GRAD_VAR_SUFFIX(),
"in3_2" + f::OperatorBase::GRAD_VAR_SUFFIX()}));
} }
...@@ -12,24 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,24 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
syntax="proto2"; syntax = "proto2";
package paddle.framework; package paddle.framework;
import "attr_type.proto"; import "attribute.proto";
// AttrDesc is used to describe Attributes of an Operator. It contains // AttrDesc is used to describe Attributes of an Operator. It contains
// name, type, and value of Attribute. // name, type, and value of Attribute.
// //
// e.g., for scale=3.0: name=scale, type=AttrType.FLOAT, value=3.0 // e.g., for scale=3.0: name=scale, type=AttrType.FLOAT, value=3.0
message AttrDesc { message AttrDesc {
required string name = 1; required string name = 1;
required AttrType type = 2; required AttrType type = 2;
optional int32 i = 3; optional int32 i = 3;
optional float f = 4; optional float f = 4;
optional string s = 5; optional string s = 5;
repeated int32 ints = 6; repeated int32 ints = 6;
repeated float floats = 7; repeated float floats = 7;
repeated string strings = 8; repeated string strings = 8;
}; };
// Protocol Message to describe an Operator. // Protocol Message to describe an Operator.
...@@ -42,15 +42,15 @@ message AttrDesc { ...@@ -42,15 +42,15 @@ message AttrDesc {
// 3rd-party language can build this proto message and call // 3rd-party language can build this proto message and call
// AddOp(const OpDesc& op_desc) of Paddle core to create an Operator. // AddOp(const OpDesc& op_desc) of Paddle core to create an Operator.
message OpDesc { message OpDesc {
// input names of this Operator. // input names of this Operator.
repeated string inputs = 1; repeated string inputs = 1;
// output names of this Operator. // output names of this Operator.
repeated string outputs = 2; repeated string outputs = 2;
// type of this Operator, such as "add", "sub", "fc". // type of this Operator, such as "add", "sub", "fc".
required string type = 3; required string type = 3;
// Attributes of this Operator. e.g., scale=3.0 in cosine op. // Attributes of this Operator. e.g., scale=3.0 in cosine op.
repeated AttrDesc attrs = 4; repeated AttrDesc attrs = 4;
}; };
\ No newline at end of file
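Note: a minimal sketch (not part of this change) of how a third-party binding could fill in an OpDesc for the scale=3.0 example above, assuming the generated proto2 C++ API for op_desc.proto; the op type "cos" and the argument names are hypothetical:

#include "paddle/framework/op_desc.pb.h"

paddle::framework::OpDesc MakeCosineOpDesc() {
  paddle::framework::OpDesc desc;
  desc.set_type("cos");            // hypothetical op type, for illustration only
  desc.add_inputs("X");
  desc.add_inputs("Y");
  desc.add_outputs("Out");
  auto* scale = desc.add_attrs();  // scale=3.0, as in the comment above
  scale->set_name("scale");
  scale->set_type(paddle::framework::AttrType::FLOAT);
  scale->set_f(3.0f);
  return desc;                     // then handed to the C++ side, e.g. OpRegistry::CreateOp(desc)
}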
...@@ -15,100 +15,102 @@ limitations under the License. */ ...@@ -15,100 +15,102 @@ limitations under the License. */
// Protocol Message for 3rd-party language binding. // Protocol Message for 3rd-party language binding.
// //
// Paddle Python package will use `OpProto` to generate op creation methods. // Paddle Python package will use `OpProto` to generate op creation methods.
// The op creation methods take user's input and generate `OpDesc` proto message, // The op creation methods take user's input and generate `OpDesc` proto
// message,
// then pass `OpDesc` to C++ side and create Op pointer. // then pass `OpDesc` to C++ side and create Op pointer.
// //
syntax="proto2"; syntax = "proto2";
package paddle.framework; package paddle.framework;
import "attr_type.proto"; import "attribute.proto";
// Attribute protocol message for 3rd-party language binding. // Attribute protocol message for 3rd-party language binding.
// It stores which attributes the Op supports and their types. // It stores which attributes the Op supports and their types.
message AttrProto { message AttrProto {
// Supported attribute name. e.g. `scale` for cosine op. // Supported attribute name. e.g. `scale` for cosine op.
required string name = 1; required string name = 1;
// Supported attribute type. // Supported attribute type.
required AttrType type = 2; required AttrType type = 2;
// Supported attribute comments. It helps 3rd-party language generate doc-string. // Supported attribute comments. It helps 3rd-party language generate
required string comment = 3; // doc-string.
required string comment = 3;
// If that attribute is generated, the Paddle third-party language // If that attribute is generated, the Paddle third-party language
// binding is responsible for filling that attribute. End users should // binding is responsible for filling that attribute. End users should
// not set that attribute. // not set that attribute.
optional bool generated = 4 [default=false]; optional bool generated = 4 [ default = false ];
} }
// Input or output message for 3rd-party language binding. // Input or output message for 3rd-party language binding.
// It contains parameter name and its comments. // It contains parameter name and its comments.
message VarProto { message VarProto {
// Input or output name in that op creation function. // Input or output name in that op creation function.
// e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names. // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
required string name = 1; required string name = 1;
// The comment for that input. It helps 3rd-party language generate doc-string. // The comment for that input. It helps 3rd-party language generate
required string comment = 2; // doc-string.
required string comment = 2;
// Is that input/output could be a list or not.
// If so, that Op should write a attributed named `input_format` or // Is that input/output could be a list or not.
// `output_format`. // If so, that Op should write a attributed named `input_format` or
// // `output_format`.
// e.g. //
// If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W` // e.g.
// could be multiple, so the multiple of `X` and `W` is True, and OpDesc // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W`
// will hold a attribute of them. // could be multiple, so the multiple of `X` and `W` is True, and OpDesc
// // will hold a attribute of them.
// The Op desc of same fc could be //
// { // The Op desc of same fc could be
// "type": "fc", // {
// "input": ["X1", "X2", "W1", "W2", "b"], // "type": "fc",
// "output": "fc.out", // "input": ["X1", "X2", "W1", "W2", "b"],
// "attrs" : { // "output": "fc.out",
// "input_format": [0, 2, 4, 5] // "attrs" : {
// } // "input_format": [0, 2, 4, 5]
// } // }
// // }
optional bool multiple = 3 [default=false]; //
optional bool multiple = 3 [ default = false ];
// It marks that output is a temporary output. That output is not used by
// user, but used by other op internally as input. If other op is not use // It marks that output is a temporary output. That output is not used by
// that output, it could be optimized early. // user, but used by other op internally as input. If other op is not use
// // that output, it could be optimized early.
// Attribute temporary_index will be set in OpDesc if there is some //
// outputs are temporary. // Attribute temporary_index will be set in OpDesc if there is some
// // outputs are temporary.
// output = [ "xxx.out1", "xxx.tmp", "xxx.out2"], //
// attrs = { // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"],
// "temporary_index": [1] // attrs = {
// } // "temporary_index": [1]
optional bool temporary = 4 [default=false]; // }
optional bool temporary = 4 [ default = false ];
// The gradient of operator can be ignored immediately
// e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2 // The gradient of operator can be ignored immediately
// can be ignored for the future optimized on graph. // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2
optional bool ignore_gradient = 6; // can be ignored for the future optimized on graph.
optional bool ignore_gradient = 6;
} }
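For reference, the fc example in the comment above would translate into an OpDesc roughly like the following sketch (an illustration, not code from this commit), where input_format records where each logical input starts in the flattened input list:

#include "paddle/framework/op_desc.pb.h"

paddle::framework::OpDesc MakeFcOpDesc() {
  paddle::framework::OpDesc fc;
  fc.set_type("fc");
  // flattened inputs: X -> {X1, X2}, W -> {W1, W2}, b -> {b}
  fc.add_inputs("X1");
  fc.add_inputs("X2");
  fc.add_inputs("W1");
  fc.add_inputs("W2");
  fc.add_inputs("b");
  fc.add_outputs("fc.out");
  auto* fmt = fc.add_attrs();      // offsets 0, 2, 4, 5 delimit X, W and b
  fmt->set_name("input_format");
  fmt->set_type(paddle::framework::AttrType::INTS);
  fmt->add_ints(0);
  fmt->add_ints(2);
  fmt->add_ints(4);
  fmt->add_ints(5);
  return fc;
}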
// Op protocol message for 3rd-party language binding. // Op protocol message for 3rd-party language binding.
// It contains all information for generating op creation method. // It contains all information for generating op creation method.
message OpProto { message OpProto {
// The input information to generate op creation method. // The input information to generate op creation method.
repeated VarProto inputs = 1; repeated VarProto inputs = 1;
// The output information to generate op creation method. // The output information to generate op creation method.
repeated VarProto outputs = 2; repeated VarProto outputs = 2;
// The attribute information to generate op creation method. // The attribute information to generate op creation method.
repeated AttrProto attrs = 3; repeated AttrProto attrs = 3;
// The comments for that Op. It helps 3rd-party language generate // The comments for that Op. It helps 3rd-party language generate
// doc-string. The whole documentation of that Op is generated by comment, // doc-string. The whole documentation of that Op is generated by comment,
// inputs, outputs, attrs together. // inputs, outputs, attrs together.
required string comment = 4; required string comment = 4;
// The type of that Op.
required string type = 5;
// The type of that Op.
required string type = 5;
} }
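The comment, inputs, outputs and attrs fields are meant to be stitched together into the op's documentation; a rough sketch of how a binding might do that (an assumption about the binding side, not code from this commit):

#include <string>
#include "paddle/framework/op_proto.pb.h"

std::string BuildDocString(const paddle::framework::OpProto& proto) {
  std::string doc = proto.comment() + "\n";
  for (const auto& in : proto.inputs()) {
    doc += "Input  " + in.name() + ": " + in.comment() + "\n";
  }
  for (const auto& out : proto.outputs()) {
    doc += "Output " + out.name() + ": " + out.comment() + "\n";
  }
  for (const auto& attr : proto.attrs()) {
    doc += "Attr   " + attr.name() + ": " + attr.comment() + "\n";
  }
  return doc;
}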
...@@ -14,37 +14,8 @@ limitations under the License. */ ...@@ -14,37 +14,8 @@ limitations under the License. */
#include <paddle/framework/op_registry.h> #include <paddle/framework/op_registry.h>
namespace paddle { #include <vector>
namespace framework {
template <>
void AttrTypeHelper::SetAttrType<int>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::INT);
}
template <>
void AttrTypeHelper::SetAttrType<float>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::FLOAT);
}
template <>
void AttrTypeHelper::SetAttrType<std::string>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::STRING);
}
template <> namespace paddle {
void AttrTypeHelper::SetAttrType<std::vector<int>>(AttrProto* attr) { namespace framework {} // namespace framework
attr->set_type(paddle::framework::AttrType::INTS);
}
template <>
void AttrTypeHelper::SetAttrType<std::vector<float>>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::FLOATS);
}
template <>
void AttrTypeHelper::SetAttrType<std::vector<std::string>>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::STRINGS);
}
} // namespace framework
} // namespace paddle } // namespace paddle
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include <type_traits> #include <type_traits>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include "paddle/framework/attr_checker.h" #include "paddle/framework/attribute.h"
#include "paddle/framework/grad_op_builder.h" #include "paddle/framework/grad_op_builder.h"
#include "paddle/framework/op_desc.pb.h" #include "paddle/framework/op_desc.pb.h"
#include "paddle/framework/scope.h" #include "paddle/framework/scope.h"
...@@ -27,49 +27,6 @@ limitations under the License. */ ...@@ -27,49 +27,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// helper class to set attribute type
struct AttrTypeHelper {
template <typename T>
static void SetAttrType(AttrProto* attr);
static Attribute GetAttrValue(const AttrDesc& attr_desc) {
switch (attr_desc.type()) {
case paddle::framework::AttrType::INT: {
return attr_desc.i();
}
case paddle::framework::AttrType::FLOAT: {
return attr_desc.f();
}
case paddle::framework::AttrType::STRING: {
return attr_desc.s();
}
case paddle::framework::AttrType::INTS: {
std::vector<int> val(attr_desc.ints_size());
for (int i = 0; i < attr_desc.ints_size(); ++i) {
val[i] = attr_desc.ints(i);
}
return val;
}
case paddle::framework::AttrType::FLOATS: {
std::vector<float> val(attr_desc.floats_size());
for (int i = 0; i < attr_desc.floats_size(); ++i) {
val[i] = attr_desc.floats(i);
}
return val;
}
case paddle::framework::AttrType::STRINGS: {
std::vector<std::string> val(attr_desc.strings_size());
for (int i = 0; i < attr_desc.strings_size(); ++i) {
val[i] = attr_desc.strings(i);
}
return val;
}
}
PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");
return boost::blank();
}
};
// This class not only makes the proto but also initializes attribute checkers. // This class not only makes the proto but also initializes attribute checkers.
class OpProtoAndCheckerMaker { class OpProtoAndCheckerMaker {
public: public:
...@@ -136,7 +93,7 @@ class OpProtoAndCheckerMaker { ...@@ -136,7 +93,7 @@ class OpProtoAndCheckerMaker {
*attr->mutable_name() = name; *attr->mutable_name() = name;
*attr->mutable_comment() = comment; *attr->mutable_comment() = comment;
attr->set_generated(generated); attr->set_generated(generated);
AttrTypeHelper::SetAttrType<T>(attr); attr->set_type(AttrTypeID<T>());
return op_checker_->AddAttrChecker<T>(name); return op_checker_->AddAttrChecker<T>(name);
} }
...@@ -297,12 +254,18 @@ class OpRegistry { ...@@ -297,12 +254,18 @@ class OpRegistry {
AttributeMap attrs; AttributeMap attrs;
for (auto& attr : op_desc.attrs()) { for (auto& attr : op_desc.attrs()) {
attrs[attr.name()] = AttrTypeHelper::GetAttrValue(attr); attrs[attr.name()] = GetAttrValue(attr);
} }
return CreateOp(op_desc.type(), inputs, outputs, attrs); return CreateOp(op_desc.type(), inputs, outputs, attrs);
} }
static bool SupportGPU(const std::string& op_type) {
OperatorWithKernel::OpKernelKey key;
key.place_ = platform::GPUPlace();
return OperatorWithKernel::AllOpKernels().at(op_type).count(key) != 0;
}
static std::shared_ptr<OperatorBase> CreateGradOp(const OperatorBase& op) { static std::shared_ptr<OperatorBase> CreateGradOp(const OperatorBase& op) {
PADDLE_ENFORCE(!op.IsNetOp(), PADDLE_ENFORCE(!op.IsNetOp(),
"Use framework::Backward to get backward ops"); "Use framework::Backward to get backward ops");
...@@ -341,7 +304,7 @@ class OpRegistry { ...@@ -341,7 +304,7 @@ class OpRegistry {
static void GenerateTempVariableName(OperatorBase* op) { static void GenerateTempVariableName(OperatorBase* op) {
static std::atomic<size_t> gUniqId(0UL); static std::atomic<size_t> gUniqId(0UL);
for (auto& outname : op->outputs_) { for (auto& outname : op->outputs_) {
if (outname == OperatorBase::TMP_VAR_NAME()) { if (outname == kTempVarName) {
outname += op->type_; outname += op->type_;
outname += "@"; outname += "@";
outname += std::to_string(gUniqId.fetch_add(1)); outname += std::to_string(gUniqId.fetch_add(1));
......
...@@ -22,14 +22,14 @@ namespace framework { ...@@ -22,14 +22,14 @@ namespace framework {
template <> template <>
Eigen::DefaultDevice& ExecutionContext::GetEigenDevice< Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
platform::CPUPlace, Eigen::DefaultDevice>() const { platform::CPUPlace, Eigen::DefaultDevice>() const {
return *device_context_.get_eigen_device<Eigen::DefaultDevice>(); return *device_context_->get_eigen_device<Eigen::DefaultDevice>();
} }
#ifndef PADDLE_ONLY_CPU #ifndef PADDLE_ONLY_CPU
template <> template <>
Eigen::GpuDevice& Eigen::GpuDevice&
ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const { ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
return *device_context_.get_eigen_device<Eigen::GpuDevice>(); return *device_context_->get_eigen_device<Eigen::GpuDevice>();
} }
#endif #endif
......
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/framework/attr_checker.h" #include "paddle/framework/attribute.h"
#include "paddle/framework/op_desc.pb.h" #include "paddle/framework/op_desc.pb.h"
#include "paddle/framework/op_proto.pb.h" #include "paddle/framework/op_proto.pb.h"
#include "paddle/framework/scope.h" #include "paddle/framework/scope.h"
...@@ -32,9 +32,29 @@ limitations under the License. */ ...@@ -32,9 +32,29 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
/// If a variable is an empty variable, that name will be used.
const std::string kEmptyVarName = "@EMPTY@";
/// If a variable is a temporary variable, that name will be set in Python,
/// but it will be converted to a unique name in scope after OpCreator.
const std::string kTempVarName = "@TEMP@";
/// If a variable's name has a certain suffix, it means that the
/// variable is the gradient of another variable.
/// e.g. Variable "x@GRAD" is the gradient of variable "x".
const std::string kGradVarSuffix = "@GRAD";
/// Variables with this suffix are supposed to be filled up with zeros.
const std::string kZeroVarSuffix = "@ZERO";
inline std::string GradVarName(const std::string& var_name) {
return var_name + kGradVarSuffix;
}
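A quick illustration (not from the commit) of what the new naming helper yields, assuming the constants declared above:

#include <cassert>
#include "paddle/framework/operator.h"

void GradVarNameExample() {
  // "x" -> "x@GRAD": gradient variables are matched purely by this suffix.
  assert(paddle::framework::GradVarName("x") == "x@GRAD");
  assert(paddle::framework::GradVarName("x") ==
         "x" + paddle::framework::kGradVarSuffix);
}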
class OperatorBase; class OperatorBase;
class InferShapeContext; class InferShapeContext;
class ExecutionContext; class ExecutionContext;
/** /**
* OperatorBase has the basic element that Net will call to do computation. * OperatorBase has the basic element that Net will call to do computation.
* Only CreateOperator from OpRegistry will new Operator directly. User * Only CreateOperator from OpRegistry will new Operator directly. User
...@@ -43,25 +63,6 @@ class ExecutionContext; ...@@ -43,25 +63,6 @@ class ExecutionContext;
*/ */
class OperatorBase { class OperatorBase {
public: public:
/// If a variable is a empty variable, that name will be used.
static std::string EMPTY_VAR_NAME() { return "@EMPTY@"; }
/// If a variable is a temporary variable, that name will be set in Python,
/// but it will be convert to a unique name in scope after OpCreator.
static std::string TMP_VAR_NAME() { return "@TEMP@"; }
/// If a variable's name has a certain suffix, it means that the
/// variable is the gradient of another varibale.
/// e.g. Variable "x@GRAD" is the gradient of varibale "x".
static std::string GRAD_VAR_SUFFIX() { return "@GRAD"; }
static std::string GRAD_VAR_NAME(const std::string& name) {
return name + GRAD_VAR_SUFFIX();
}
/// Variables with this suffix are supposed to be filled up with zeros.
static std::string ZERO_VAR_SUFFIX() { return "@ZERO"; }
virtual ~OperatorBase() {} virtual ~OperatorBase() {}
template <typename T> template <typename T>
...@@ -173,7 +174,11 @@ class OperatorContext { ...@@ -173,7 +174,11 @@ class OperatorContext {
template <typename T> template <typename T>
T* Output(const size_t index) const { T* Output(const size_t index) const {
auto var = OutputVar(index); auto var = OutputVar(index);
PADDLE_ENFORCE(var != nullptr, "Output(%d) should not be nullptr", index); PADDLE_ENFORCE(
var != nullptr,
"Output(%d) not be nullptr, which means variable [%s] does not "
"exist in scope",
index, op_.outputs_[index]);
return var->GetMutable<T>(); return var->GetMutable<T>();
} }
...@@ -251,7 +256,7 @@ struct EigenDeviceConverter<platform::GPUPlace> { ...@@ -251,7 +256,7 @@ struct EigenDeviceConverter<platform::GPUPlace> {
class ExecutionContext : public OperatorContext { class ExecutionContext : public OperatorContext {
public: public:
ExecutionContext(const OperatorBase* op, const Scope& scope, ExecutionContext(const OperatorBase* op, const Scope& scope,
const platform::DeviceContext& device_context) const platform::DeviceContext* device_context)
: OperatorContext(op, scope), device_context_(device_context) {} : OperatorContext(op, scope), device_context_(device_context) {}
template <typename PlaceType, template <typename PlaceType,
...@@ -259,9 +264,9 @@ class ExecutionContext : public OperatorContext { ...@@ -259,9 +264,9 @@ class ExecutionContext : public OperatorContext {
typename EigenDeviceConverter<PlaceType>::EigenDeviceType> typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
DeviceType& GetEigenDevice() const; DeviceType& GetEigenDevice() const;
platform::Place GetPlace() const { return device_context_.GetPlace(); } platform::Place GetPlace() const { return device_context_->GetPlace(); }
const platform::DeviceContext& device_context_; const platform::DeviceContext* device_context_;
}; };
class OpKernel { class OpKernel {
...@@ -310,7 +315,7 @@ class OperatorWithKernel : public OperatorBase { ...@@ -310,7 +315,7 @@ class OperatorWithKernel : public OperatorBase {
void Run(const Scope& scope, void Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const final { const platform::DeviceContext& dev_ctx) const final {
auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx)); auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
opKernel->Compute(ExecutionContext(this, scope, dev_ctx)); opKernel->Compute(ExecutionContext(this, scope, &dev_ctx));
} }
static std::unordered_map<std::string /* op_type */, OpKernelMap>& static std::unordered_map<std::string /* op_type */, OpKernelMap>&
......
...@@ -157,22 +157,22 @@ class CPUKernalMultiInputsTest : public OpKernel { ...@@ -157,22 +157,22 @@ class CPUKernalMultiInputsTest : public OpKernel {
ASSERT_EQ(xs[2], "x2"); ASSERT_EQ(xs[2], "x2");
auto inVar0 = ctx.MultiInputVar("xs"); auto inVar0 = ctx.MultiInputVar("xs");
ASSERT_EQ(inVar0.size(), 3); ASSERT_EQ(inVar0.size(), 3U);
auto intVar1 = ctx.InputVar("k"); auto intVar1 = ctx.InputVar("k");
ASSERT_NE(intVar1, nullptr); ASSERT_NE(intVar1, nullptr);
auto outVar0 = ctx.MultiOutputVar("ys"); auto outVar0 = ctx.MultiOutputVar("ys");
ASSERT_EQ(outVar0.size(), 2); ASSERT_EQ(outVar0.size(), 2U);
auto inTensor0 = ctx.MultiInput<Tensor>("xs"); auto inTensor0 = ctx.MultiInput<Tensor>("xs");
ASSERT_EQ(inTensor0.size(), 3); ASSERT_EQ(inTensor0.size(), 3U);
auto intTensor1 = ctx.Input<Tensor>("k"); auto intTensor1 = ctx.Input<Tensor>("k");
ASSERT_NE(intTensor1, nullptr); ASSERT_NE(intTensor1, nullptr);
auto outTensor0 = ctx.MultiOutput<Tensor>("ys"); auto outTensor0 = ctx.MultiOutput<Tensor>("ys");
ASSERT_EQ(outTensor0.size(), 2); ASSERT_EQ(outTensor0.size(), 2U);
auto k = ctx.op_.Input("k"); auto k = ctx.op_.Input("k");
ASSERT_EQ(k, "k0"); ASSERT_EQ(k, "k0");
......
...@@ -32,7 +32,7 @@ limitations under the License. */ ...@@ -32,7 +32,7 @@ limitations under the License. */
namespace py = pybind11; namespace py = pybind11;
USE_OP(add_two); USE_OP(add_two);
USE_OP(onehot_cross_entropy); USE_OP_CPU(onehot_cross_entropy);
USE_OP_WITHOUT_KERNEL(fc); USE_OP_WITHOUT_KERNEL(fc);
USE_OP(sgd); USE_OP(sgd);
USE_OP(mul); USE_OP(mul);
...@@ -164,8 +164,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -164,8 +164,8 @@ All parameter, weight, gradient are variables in Paddle.
m.def_submodule( m.def_submodule(
"var_names", "var_names",
"The module will return special predefined variable name in Paddle") "The module will return special predefined variable name in Paddle")
.def("empty", OperatorBase::EMPTY_VAR_NAME) .def("empty", []() { return kEmptyVarName; })
.def("temp", OperatorBase::TMP_VAR_NAME); .def("temp", []() { return kTempVarName; });
// clang-format off // clang-format off
py::class_<paddle::platform::DeviceContext>(m, "DeviceContext") py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
.def_static("create", .def_static("create",
...@@ -201,6 +201,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -201,6 +201,8 @@ All parameter, weight, gradient are variables in Paddle.
return OpRegistry::CreateOp(desc); return OpRegistry::CreateOp(desc);
}); });
operator_base.def_static("support_gpu", &OpRegistry::SupportGPU);
operator_base.def("backward", operator_base.def("backward",
[](const OperatorBase &forwardOp, [](const OperatorBase &forwardOp,
const std::unordered_set<std::string> &no_grad_vars) { const std::unordered_set<std::string> &no_grad_vars) {
......
...@@ -18,10 +18,10 @@ limitations under the License. */ ...@@ -18,10 +18,10 @@ limitations under the License. */
namespace paddle { namespace paddle {
TEST(BlockExpandForward, real) { TEST(BlockExpandForward, real) {
for (size_t batchSize : {5, 32}) { for (size_t batchSize : {5}) {
for (size_t channels : {1, 5, 32}) { for (size_t channels : {1, 5}) {
for (size_t inputHeight : {5, 33, 100}) { for (size_t inputHeight : {5, 33}) {
for (size_t inputWidth : {5, 32, 96}) { for (size_t inputWidth : {5, 32}) {
for (size_t block : {1, 3, 5}) { for (size_t block : {1, 3, 5}) {
for (size_t stride : {1, 2}) { for (size_t stride : {1, 2}) {
for (size_t padding : {0, 1}) { for (size_t padding : {0, 1}) {
...@@ -61,10 +61,10 @@ TEST(BlockExpandForward, real) { ...@@ -61,10 +61,10 @@ TEST(BlockExpandForward, real) {
} }
TEST(BlockExpandBackward, real) { TEST(BlockExpandBackward, real) {
for (size_t batchSize : {5, 32}) { for (size_t batchSize : {5}) {
for (size_t channels : {1, 5, 32}) { for (size_t channels : {1, 5}) {
for (size_t inputHeight : {5, 33, 100}) { for (size_t inputHeight : {5, 33}) {
for (size_t inputWidth : {5, 32, 96}) { for (size_t inputWidth : {5, 32}) {
for (size_t block : {1, 3, 5}) { for (size_t block : {1, 3, 5}) {
for (size_t stride : {1, 2}) { for (size_t stride : {1, 2}) {
for (size_t padding : {0, 1}) { for (size_t padding : {0, 1}) {
......
...@@ -32,7 +32,7 @@ TEST(BufferTest, SequenceIdArg) { ...@@ -32,7 +32,7 @@ TEST(BufferTest, SequenceIdArg) {
sizeOfValuType(VALUE_TYPE_INT32)); sizeOfValuType(VALUE_TYPE_INT32));
SequenceIdArg buffer(memory.getBuf(), shape); SequenceIdArg buffer(memory.getBuf(), shape);
EXPECT_EQ(buffer.data(), memory.getBuf()); EXPECT_EQ(buffer.data(), memory.getBuf());
EXPECT_EQ(buffer.numSeqs(), 9); EXPECT_EQ(buffer.numSeqs(), 9U);
} }
} // namespace paddle } // namespace paddle
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_base.h"
#include "ContextProjectionOp.h" #include "ContextProjectionOp.h"
#include "hl_base.h"
namespace paddle { namespace paddle {
...@@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input, ...@@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input,
int block_size = blockDim.x; int block_size = blockDim.x;
int sequenceId = blockIdx.x; int sequenceId = blockIdx.x;
int seq_start = sequence[sequenceId]; int seq_start = sequence[sequenceId];
int seq_end = sequence[sequenceId+1]; int seq_end = sequence[sequenceId + 1];
real value = 0; real value = 0;
int instances = seq_end - seq_start + context_length - 1; int instances = seq_end - seq_start + context_length - 1;
...@@ -49,8 +49,9 @@ __global__ void KeContextProjectionForward(const real* input, ...@@ -49,8 +49,9 @@ __global__ void KeContextProjectionForward(const real* input,
} else if ((i + context_start) >= (seq_end - seq_start)) { } else if ((i + context_start) >= (seq_end - seq_start)) {
if (padding) { if (padding) {
value = value =
weight[(begin_pad + i + context_start - (seq_end - seq_start)) * weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
input_dim + idx]; input_dim +
idx];
} else { } else {
continue; continue;
} }
...@@ -61,7 +62,7 @@ __global__ void KeContextProjectionForward(const real* input, ...@@ -61,7 +62,7 @@ __global__ void KeContextProjectionForward(const real* input,
int outx = (i - context_length) < 0 ? i : (context_length - 1); int outx = (i - context_length) < 0 ? i : (context_length - 1);
int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1)); int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
real* output_r = real* output_r =
output + outy * input_dim * context_length + outx * input_dim; output + outy * input_dim * context_length + outx * input_dim;
for (int j = outy; j < seq_end - seq_start; j++) { for (int j = outy; j < seq_end - seq_start; j++) {
output_r[idx] += value; output_r[idx] += value;
if (j - outy == outx) break; if (j - outy == outx) break;
...@@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input, ...@@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input,
dim3 grid(blocks_x, blocks_y); dim3 grid(blocks_x, blocks_y);
if (weight) { if (weight) {
KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>> KeContextProjectionForward<true><<<grid, threads, 0, STREAM_DEFAULT>>>(
(input, sequence, weight, output, input_dim, input,
context_length, context_start, begin_pad); sequence,
} else { weight,
KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>> output,
(input, sequence, weight, output, input_dim, input_dim,
context_length, context_start, begin_pad); context_length,
context_start,
begin_pad);
} else {
KeContextProjectionForward<false><<<grid, threads, 0, STREAM_DEFAULT>>>(
input,
sequence,
weight,
output,
input_dim,
context_length,
context_start,
begin_pad);
} }
CHECK_SYNC("hl_context_projection_forward failed"); CHECK_SYNC("hl_context_projection_forward failed");
} }
...@@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad, ...@@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
int block_size = blockDim.x; int block_size = blockDim.x;
int sequenceId = blockIdx.x; int sequenceId = blockIdx.x;
int seq_start = sequence[sequenceId]; int seq_start = sequence[sequenceId];
int seq_end = sequence[sequenceId+1]; int seq_end = sequence[sequenceId + 1];
real value = 0; real value = 0;
int instances = seq_end - seq_start + context_length - 1; int instances = seq_end - seq_start + context_length - 1;
...@@ -170,7 +183,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad, ...@@ -170,7 +183,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
int outx = (i - context_length) < 0 ? i : (context_length - 1); int outx = (i - context_length) < 0 ? i : (context_length - 1);
int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1)); int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
real* output_r = real* output_r =
out + outy * input_dim * context_length + outx * input_dim; out + outy * input_dim * context_length + outx * input_dim;
for (int j = outy; j < seq_end - seq_start; j++) { for (int j = outy; j < seq_end - seq_start; j++) {
value += output_r[idx]; value += output_r[idx];
if (j - outy == outx) break; if (j - outy == outx) break;
...@@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad, ...@@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad,
int blocks_y = 1; int blocks_y = 1;
dim3 threads(block_size, 1); dim3 threads(block_size, 1);
dim3 grid(blocks_x, blocks_y); dim3 grid(blocks_x, blocks_y);
KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>> KeContextProjectionBackwardData<<<grid, threads, 0, STREAM_DEFAULT>>>(
(out_grad, sequence, input_grad, input_dim, context_length, context_start); out_grad, sequence, input_grad, input_dim, context_length, context_start);
CHECK_SYNC("hl_context_projection_backward_data failed"); CHECK_SYNC("hl_context_projection_backward_data failed");
} }
...@@ -231,7 +244,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad, ...@@ -231,7 +244,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
context_start); context_start);
} }
template<int THREADS_X, int THREADS_Y> template <int THREADS_X, int THREADS_Y>
__global__ void KeContextProjectionBackwardWeight(const real* out_grad, __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
const int* sequence, const int* sequence,
real* w_grad, real* w_grad,
...@@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad, ...@@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
if (weight_idx < w_dim) { if (weight_idx < w_dim) {
for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) { for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
int seq_start = sequence[seqId]; int seq_start = sequence[seqId];
int seq_end = sequence[seqId+1]; int seq_end = sequence[seqId + 1];
output_r = const_cast<real*>(out_grad) output_r =
+ seq_start * w_dim * context_length; const_cast<real*>(out_grad) + seq_start * w_dim * context_length;
if (context_start < 0) { if (context_start < 0) {
if (padId + context_start < 0) { if (padId + context_start < 0) {
instanceId = padId; instanceId = padId;
} else { } else {
// begin_pad > 0; // begin_pad > 0;
instanceId = (padId - begin_pad) + instanceId =
(seq_end - seq_start) - context_start; (padId - begin_pad) + (seq_end - seq_start) - context_start;
} }
} else { } else {
if (padId + (seq_end - seq_start) < context_start) { if (padId + (seq_end - seq_start) < context_start) {
...@@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad, ...@@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
} }
} }
int outx = (instanceId - context_length) < 0 ? int outx =
instanceId : (context_length - 1); (instanceId - context_length) < 0 ? instanceId : (context_length - 1);
int outy = (instanceId - context_length) < 0 ? int outy = (instanceId - context_length) < 0
0 : (instanceId - (context_length - 1)); ? 0
: (instanceId - (context_length - 1));
output_r += outy * w_dim * context_length + outx * w_dim; output_r += outy * w_dim * context_length + outx * w_dim;
for (int j = outy; j < seq_end - seq_start; j++) { for (int j = outy; j < seq_end - seq_start; j++) {
value += output_r[weight_idx]; value += output_r[weight_idx];
...@@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad, ...@@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
} }
__syncthreads(); __syncthreads();
for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) { for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) {
if (idy < stride) { if (idy < stride) {
sum_s[idy][idx] += sum_s[idy + stride][idx]; sum_s[idy][idx] += sum_s[idy + stride][idx];
} }
...@@ -339,22 +353,27 @@ void hl_context_projection_backward_weight(const real* out_grad, ...@@ -339,22 +353,27 @@ void hl_context_projection_backward_weight(const real* out_grad,
dim3 threads(threads_x, threads_y); dim3 threads(threads_x, threads_y);
dim3 grid(blocks_x, 1); dim3 grid(blocks_x, 1);
KeContextProjectionBackwardWeight<32, 32> KeContextProjectionBackwardWeight<32,
<<< grid, threads, 0, STREAM_DEFAULT >>> 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
(out_grad, sequence, w_grad, num_sequences, w_dim, out_grad,
context_length, context_start, begin_pad); sequence,
w_grad,
num_sequences,
w_dim,
context_length,
context_start,
begin_pad);
CHECK_SYNC("hl_context_projection_backward_weight failed"); CHECK_SYNC("hl_context_projection_backward_weight failed");
} }
template <> template <>
void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>( void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
const GpuMatrix& out_grad, GpuMatrix& w_grad,
GpuMatrix& w_grad, const GpuIVector& seq_vec,
const GpuIVector& seq_vec, size_t context_length,
size_t context_length, int context_start,
int context_start, size_t total_pad,
size_t total_pad, size_t begin_pad) {
size_t begin_pad) {
hl_context_projection_backward_weight(out_grad.getData(), hl_context_projection_backward_weight(out_grad.getData(),
seq_vec.getData(), seq_vec.getData(),
w_grad.getData(), w_grad.getData(),
...@@ -376,23 +395,18 @@ void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad, ...@@ -376,23 +395,18 @@ void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
size_t begin_pad, size_t begin_pad,
bool is_padding, bool is_padding,
size_t total_pad) { size_t total_pad) {
if (in_grad) { if (in_grad) {
ContextProjectionBackwardData<DEVICE_TYPE_GPU>( ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
out_grad, out_grad, in_grad, sequence, context_length, context_start);
in_grad, }
sequence, if (is_padding && w_grad) {
context_length, ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(out_grad,
context_start); w_grad,
} sequence,
if (is_padding && w_grad) { context_length,
ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>( context_start,
out_grad, total_pad,
w_grad, begin_pad);
sequence,
context_length,
context_start,
total_pad,
begin_pad);
} }
} }
......
...@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "CosSimOp.h"
#include "hl_base.h" #include "hl_base.h"
#include "hl_device_functions.cuh" #include "hl_device_functions.cuh"
#include "CosSimOp.h"
namespace paddle { namespace paddle {
template<int block_size> template <int block_size>
__global__ void KeCosSim(real* output, __global__ void KeCosSim(real* output,
const real* input1, const real* input1,
const real* input2, const real* input2,
...@@ -78,8 +78,8 @@ void hlCossim(real* output, ...@@ -78,8 +78,8 @@ void hlCossim(real* output,
dim3 threads(block_size, 1); dim3 threads(block_size, 1);
dim3 grid(1, input1_height); dim3 grid(1, input1_height);
KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>> KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
(output, input1, input2, width, input1_height, input2_height, scale); output, input1, input2, width, input1_height, input2_height, scale);
CHECK_SYNC("hlCossim failed"); CHECK_SYNC("hlCossim failed");
} }
...@@ -99,7 +99,7 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat, ...@@ -99,7 +99,7 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale); hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
} }
template<int block_size> template <int block_size>
__global__ void KeCosSimDerivative(const real* grad, __global__ void KeCosSimDerivative(const real* grad,
const real* output, const real* output,
const real* prev_out_x, const real* prev_out_x,
...@@ -148,14 +148,13 @@ __global__ void KeCosSimDerivative(const real* grad, ...@@ -148,14 +148,13 @@ __global__ void KeCosSimDerivative(const real* grad,
if (xy[0] == 0) { if (xy[0] == 0) {
real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0])); real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
for (int index = tid; index < width; index += block_size) { for (int index = tid; index < width; index += block_size) {
prev_grad_x[index] += prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal;
scale * grad[ty] * prev_out_y[index] * reciprocal;
if (input2_height > 1) { if (input2_height > 1) {
prev_grad_y[index] += prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal;
scale * grad[ty] * prev_out_x[index] * reciprocal;
} else { } else {
paddle::paddleAtomicAdd(prev_grad_y + index, paddle::paddleAtomicAdd(
scale * grad[ty] * prev_out_x[index] * reciprocal); prev_grad_y + index,
scale * grad[ty] * prev_out_x[index] * reciprocal);
} }
} }
} else { } else {
...@@ -163,17 +162,18 @@ __global__ void KeCosSimDerivative(const real* grad, ...@@ -163,17 +162,18 @@ __global__ void KeCosSimDerivative(const real* grad,
real reciprocalSquareSumX = 1.0 / xx[0]; real reciprocalSquareSumX = 1.0 / xx[0];
real reciprocalSquareSumY = 1.0 / yy[0]; real reciprocalSquareSumY = 1.0 / yy[0];
for (int index = tid; index < width; index += block_size) { for (int index = tid; index < width; index += block_size) {
prev_grad_x[index] += output[ty] * grad[ty] * prev_grad_x[index] +=
(prev_out_y[index] * reciprocalXY - output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY -
prev_out_x[index] * reciprocalSquareSumX); prev_out_x[index] * reciprocalSquareSumX);
if (input2_height > 1) { if (input2_height > 1) {
prev_grad_y[index] += output[ty] * grad[ty] * prev_grad_y[index] +=
(prev_out_x[index] * reciprocalXY - output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
prev_out_y[index] * reciprocalSquareSumY); prev_out_y[index] * reciprocalSquareSumY);
} else { } else {
paddle::paddleAtomicAdd(prev_grad_y + index, output[ty] * grad[ty] * paddle::paddleAtomicAdd(
(prev_out_x[index] * reciprocalXY - prev_grad_y + index,
prev_out_y[index] * reciprocalSquareSumY)); output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
prev_out_y[index] * reciprocalSquareSumY));
} }
} }
} }
...@@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad, ...@@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad,
const int block_size = 256; const int block_size = 256;
dim3 threads(block_size, 1); dim3 threads(block_size, 1);
dim3 grid(1, input1_height); dim3 grid(1, input1_height);
KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>> KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
(grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y, width, grad,
input1_height, input2_height, scale); output,
prev_out_x,
prev_out_y,
prev_grad_x,
prev_grad_y,
width,
input1_height,
input2_height,
scale);
CHECK_SYNC("hlCossimDerivate failed"); CHECK_SYNC("hlCossimDerivate failed");
} }
...@@ -214,9 +222,9 @@ void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad, ...@@ -214,9 +222,9 @@ void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
real scale) { real scale) {
CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() && CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
in2_val.getData() && in1_grad.getData() && in2_grad.getData()); in2_val.getData() && in1_grad.getData() && in2_grad.getData());
CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ &&
&& in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_) in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
<< "Matrix types are not equally GPU"; << "Matrix types are not equally GPU";
size_t dim = in1_val.getWidth(); size_t dim = in1_val.getWidth();
const real* grad = out_grad.getData(); const real* grad = out_grad.getData();
......
...@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_base.h"
#include "CropOp.h" #include "CropOp.h"
#include "hl_base.h"
namespace paddle { namespace paddle {
__global__ void KeCrop(real* outputs, const real* inputs, __global__ void KeCrop(real* outputs,
int inC, int inH, int inW, const real* inputs,
int cropC, int cropH, int cropW, int inC,
int outC, int outH, int outW, int nthreads) { int inH,
int inW,
int cropC,
int cropH,
int cropW,
int outC,
int outH,
int outW,
int nthreads) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x; const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < nthreads) { if (idx < nthreads) {
const int w = idx % outW; const int w = idx % outW;
...@@ -35,12 +43,12 @@ __global__ void KeCrop(real* outputs, const real* inputs, ...@@ -35,12 +43,12 @@ __global__ void KeCrop(real* outputs, const real* inputs,
template <> template <>
void Crop<DEVICE_TYPE_GPU>(real* outputs, void Crop<DEVICE_TYPE_GPU>(real* outputs,
const real* inputs, const real* inputs,
const TensorShape inShape, const TensorShape inShape,
const TensorShape outShape, const TensorShape outShape,
const FuncConfig& conf) { const FuncConfig& conf) {
std::vector<uint32_t> crop_corner = std::vector<uint32_t> crop_corner =
conf.get<std::vector<uint32_t>>("crop_corner"); conf.get<std::vector<uint32_t>>("crop_corner");
int cropC = crop_corner[1]; int cropC = crop_corner[1];
int cropH = crop_corner[2]; int cropH = crop_corner[2];
int cropW = crop_corner[3]; int cropW = crop_corner[3];
...@@ -58,16 +66,33 @@ void Crop<DEVICE_TYPE_GPU>(real* outputs, ...@@ -58,16 +66,33 @@ void Crop<DEVICE_TYPE_GPU>(real* outputs,
int blockSize = 1024; int blockSize = 1024;
int gridSize = (nth + blockSize - 1) / blockSize; int gridSize = (nth + blockSize - 1) / blockSize;
KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>> KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
(outputs, inputs, inC, inH, inW, cropC, cropH, cropW, inputs,
outC, outH, outW, nth); inC,
inH,
inW,
cropC,
cropH,
cropW,
outC,
outH,
outW,
nth);
CHECK_SYNC("Crop"); CHECK_SYNC("Crop");
} }
__global__ void KeCropDiff(const real* inGrad, real* outGrad, __global__ void KeCropDiff(const real* inGrad,
int inC, int inH, int inW, real* outGrad,
int cropC, int cropH, int cropW, int inC,
int outC, int outH, int outW, int nthreads) { int inH,
int inW,
int cropC,
int cropH,
int cropW,
int outC,
int outH,
int outW,
int nthreads) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x; const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < nthreads) { if (idx < nthreads) {
const int w = idx % inW; const int w = idx % inW;
...@@ -84,12 +109,12 @@ __global__ void KeCropDiff(const real* inGrad, real* outGrad, ...@@ -84,12 +109,12 @@ __global__ void KeCropDiff(const real* inGrad, real* outGrad,
template <> template <>
void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad, void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
real* outGrad, real* outGrad,
const TensorShape inShape, const TensorShape inShape,
const TensorShape outShape, const TensorShape outShape,
const FuncConfig& conf) { const FuncConfig& conf) {
std::vector<uint32_t> crop_corner = std::vector<uint32_t> crop_corner =
conf.get<std::vector<uint32_t>>("crop_corner"); conf.get<std::vector<uint32_t>>("crop_corner");
int cropC = crop_corner[1]; int cropC = crop_corner[1];
int cropH = crop_corner[2]; int cropH = crop_corner[2];
int cropW = crop_corner[3]; int cropW = crop_corner[3];
...@@ -107,9 +132,18 @@ void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad, ...@@ -107,9 +132,18 @@ void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
int blockSize = 1024; int blockSize = 1024;
int gridSize = (nth + blockSize - 1) / blockSize; int gridSize = (nth + blockSize - 1) / blockSize;
KeCropDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>> KeCropDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
(inGrad, outGrad, inC, inH, inW, cropC, cropH, cropW, outGrad,
outC, outH, outW, nth); inC,
inH,
inW,
cropC,
cropH,
cropW,
outC,
outH,
outW,
nth);
CHECK_SYNC("CropGrad"); CHECK_SYNC("CropGrad");
} }
......
...@@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_base.h"
#include "CrossMapNormalOp.h" #include "CrossMapNormalOp.h"
#include "hl_base.h"
namespace paddle { namespace paddle {
__global__ void KeCMRNormFillScale(size_t imageSize, const real* in, __global__ void KeCMRNormFillScale(size_t imageSize,
real* scale, size_t channels, const real* in,
size_t height, size_t width, size_t size, real* scale,
size_t channels,
size_t height,
size_t width,
size_t size,
real alpha) { real alpha) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x; const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < imageSize) { if (idx < imageSize) {
...@@ -51,8 +55,10 @@ __global__ void KeCMRNormFillScale(size_t imageSize, const real* in, ...@@ -51,8 +55,10 @@ __global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
} }
} }
__global__ void KeCMRNormOutput(size_t inputSize, const real* in, __global__ void KeCMRNormOutput(size_t inputSize,
const real* scale, real negative_beta, const real* in,
const real* scale,
real negative_beta,
real* out) { real* out) {
const int index = threadIdx.x + blockIdx.x * blockDim.x; const int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < inputSize) { if (index < inputSize) {
...@@ -74,24 +80,30 @@ void CrossMapNormal<DEVICE_TYPE_GPU>(real* outputs, ...@@ -74,24 +80,30 @@ void CrossMapNormal<DEVICE_TYPE_GPU>(real* outputs,
size_t imageSize = numSamples * height * width; size_t imageSize = numSamples * height * width;
int blockSize = 1024; int blockSize = 1024;
int gridSize = (imageSize + 1024 - 1) / 1024; int gridSize = (imageSize + 1024 - 1) / 1024;
KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>> KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
(imageSize, inputs, denoms, channels, height, width, size, scale); imageSize, inputs, denoms, channels, height, width, size, scale);
size_t inputSize = numSamples * height * width *channels; size_t inputSize = numSamples * height * width * channels;
blockSize = 1024; blockSize = 1024;
gridSize = (inputSize + 1024 - 1) / 1024; gridSize = (inputSize + 1024 - 1) / 1024;
KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>> KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
(inputSize, inputs, denoms, -pow, outputs); inputSize, inputs, denoms, -pow, outputs);
CHECK_SYNC("CrossMapNormal"); CHECK_SYNC("CrossMapNormal");
} }
__global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data, __global__ void KeCMRNormDiff(size_t imageSize,
const real* top_data, const real* scale, const real* bottom_data,
const real* top_diff, size_t channels, const real* top_data,
size_t height, size_t width, size_t size, const real* scale,
real negative_beta, real cache_ratio, const real* top_diff,
real* bottom_diff ) { size_t channels,
size_t height,
size_t width,
size_t size,
real negative_beta,
real cache_ratio,
real* bottom_diff) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x; const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < imageSize) { if (idx < imageSize) {
const int w = idx % width; const int w = idx % width;
...@@ -113,17 +125,17 @@ __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data, ...@@ -113,17 +125,17 @@ __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
while (index < channels + post_pad) { while (index < channels + post_pad) {
if (index < channels) { if (index < channels) {
accum += top_diff[index * step] * top_data[index * step] / accum += top_diff[index * step] * top_data[index * step] /
scale[index * step]; scale[index * step];
} }
if (index >= size) { if (index >= size) {
accum -= top_diff[(index - size) * step] * accum -= top_diff[(index - size) * step] *
top_data[(index - size) * step] / scale[(index - size) * step]; top_data[(index - size) * step] / scale[(index - size) * step];
} }
if (index >= post_pad) { if (index >= post_pad) {
bottom_diff[(index - post_pad) * step] += bottom_diff[(index - post_pad) * step] +=
top_diff[(index - post_pad) * step] * top_diff[(index - post_pad) * step] *
pow(scale[(index - post_pad) * step], negative_beta) - cache_ratio * pow(scale[(index - post_pad) * step], negative_beta) -
bottom_data[(index - post_pad) * step] * accum; cache_ratio * bottom_data[(index - post_pad) * step] * accum;
} }
++index; ++index;
} }
...@@ -147,9 +159,18 @@ void CrossMapNormalGrad<DEVICE_TYPE_GPU>(real* inputsGrad, ...@@ -147,9 +159,18 @@ void CrossMapNormalGrad<DEVICE_TYPE_GPU>(real* inputsGrad,
int blockSize = 1024; int blockSize = 1024;
int gridSize = (imageSize + 1024 - 1) / 1024; int gridSize = (imageSize + 1024 - 1) / 1024;
KeCMRNormDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>> KeCMRNormDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(imageSize,
(imageSize, inputsValue, outputsValue, denoms, outputsGrad, channels, inputsValue,
height, width, size, -pow, 2.0f * pow * scale, inputsGrad); outputsValue,
denoms,
outputsGrad,
channels,
height,
width,
size,
-pow,
2.0f * pow * scale,
inputsGrad);
CHECK_SYNC("CrossMapNormalGrad"); CHECK_SYNC("CrossMapNormalGrad");
} }
......
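Note: all of the CrossMapNormal launches above size a one-dimensional grid with the same ceiling division over a fixed 1024-thread block. A minimal standalone sketch of that pattern (the helper name numBlocks is illustrative, not part of Paddle):

#include <cstdio>
#include <cstddef>

// Ceiling division: enough blocks so gridSize * blockSize covers every element.
inline int numBlocks(std::size_t elements, int blockSize = 1024) {
  return static_cast<int>((elements + blockSize - 1) / blockSize);
}

int main() {
  // Example shapes: numSamples=5, channels=32, height=33, width=32.
  std::size_t imageSize = 5 * 33 * 32;       // one thread per spatial position
  std::size_t inputSize = 5 * 32 * 33 * 32;  // one thread per element
  std::printf("scale grid=%d, output grid=%d\n",
              numBlocks(imageSize), numBlocks(inputSize));
  return 0;
}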
...@@ -18,11 +18,11 @@ limitations under the License. */ ...@@ -18,11 +18,11 @@ limitations under the License. */
namespace paddle { namespace paddle {
TEST(CrossMapNormal, real) { TEST(CrossMapNormal, real) {
for (size_t numSamples : {5, 32}) { for (size_t numSamples : {5}) {
for (size_t channels : {1, 5, 32}) { for (size_t channels : {1, 5}) {
for (size_t imgSizeH : {5, 33, 100}) { for (size_t imgSizeH : {5, 33}) {
for (size_t imgSizeW : {5, 32, 96}) { for (size_t imgSizeW : {5, 32}) {
for (size_t size : {1, 2, 3, 5, 7}) { for (size_t size : {1, 3}) {
VLOG(3) << " numSamples=" << numSamples << " channels=" << channels VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
<< " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
<< " size=" << size; << " size=" << size;
...@@ -48,11 +48,11 @@ TEST(CrossMapNormal, real) { ...@@ -48,11 +48,11 @@ TEST(CrossMapNormal, real) {
} }
TEST(CrossMapNormalGrad, real) { TEST(CrossMapNormalGrad, real) {
for (size_t numSamples : {5, 32}) { for (size_t numSamples : {5}) {
for (size_t channels : {1, 5, 32}) { for (size_t channels : {1, 5}) {
for (size_t imgSizeH : {5, 33, 100}) { for (size_t imgSizeH : {5, 33}) {
for (size_t imgSizeW : {5, 32, 96}) { for (size_t imgSizeW : {5, 32}) {
for (size_t size : {1, 2, 3, 5, 7}) { for (size_t size : {1, 3}) {
VLOG(3) << " numSamples=" << numSamples << " channels=" << channels VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
<< " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
<< " size=" << size; << " size=" << size;
......
...@@ -20,17 +20,25 @@ namespace paddle { ...@@ -20,17 +20,25 @@ namespace paddle {
// CUDA kernel to compute the depthwise convolution forward pass // CUDA kernel to compute the depthwise convolution forward pass
template <class T> template <class T>
__global__ __global__ void ConvolutionDepthwiseForward(const int nthreads,
void ConvolutionDepthwiseForward(const int nthreads, const T* const inputData,
const T* const inputData, const T* const filterData, const T* const filterData,
const int batchSize, const int outputChannels, const int outputHeight, const int batchSize,
const int outputWidth, const int inputChannels, const int inputHeight, const int outputChannels,
const int inputWidth, const int filterMultiplier, const int filterHeight, const int outputHeight,
const int filterWidth, const int strideH, const int strideW, const int outputWidth,
const int paddingH, const int paddingW, T* const outputData) { const int inputChannels,
const int inputHeight,
int index = const int inputWidth,
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; const int filterMultiplier,
const int filterHeight,
const int filterWidth,
const int strideH,
const int strideW,
const int paddingH,
const int paddingW,
T* const outputData) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) { if (index < nthreads) {
const int batch = index / outputChannels / outputHeight / outputWidth; const int batch = index / outputChannels / outputHeight / outputWidth;
...@@ -45,32 +53,36 @@ void ConvolutionDepthwiseForward(const int nthreads, ...@@ -45,32 +53,36 @@ void ConvolutionDepthwiseForward(const int nthreads,
const int w_in_start = -paddingW + w_out * strideW; const int w_in_start = -paddingW + w_out * strideW;
const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1; const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1; const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
if ((h_in_start >= 0) && (h_in_end < inputHeight) if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) &&
&& (w_in_start >= 0) && (w_in_end < inputWidth)) { (w_in_end < inputWidth)) {
for (int kh = 0; kh < filterHeight; ++kh) { for (int kh = 0; kh < filterHeight; ++kh) {
for (int kw = 0; kw < filterWidth; ++kw) { for (int kw = 0; kw < filterWidth; ++kw) {
const int h_in = -paddingH + h_out * strideH + kh; const int h_in = -paddingH + h_out * strideH + kh;
const int w_in = -paddingW + w_out * strideW + kw; const int w_in = -paddingW + w_out * strideW + kw;
const int offset = ((batch * inputChannels + c_in) const int offset =
* inputHeight + h_in) * inputWidth + w_in; ((batch * inputChannels + c_in) * inputHeight + h_in) *
value += (*weight) * inputData[offset]; inputWidth +
++weight; w_in;
} value += (*weight) * inputData[offset];
++weight;
} }
}
} else { } else {
for (int kh = 0; kh < filterHeight; ++kh) { for (int kh = 0; kh < filterHeight; ++kh) {
for (int kw = 0; kw < filterWidth; ++kw) { for (int kw = 0; kw < filterWidth; ++kw) {
const int h_in = -paddingH + h_out * strideH + kh; const int h_in = -paddingH + h_out * strideH + kh;
const int w_in = -paddingW + w_out * strideW + kw; const int w_in = -paddingW + w_out * strideW + kw;
if ((h_in >= 0) && (h_in < inputHeight) if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
&& (w_in >= 0) && (w_in < inputWidth)) { (w_in < inputWidth)) {
const int offset = ((batch * inputChannels + c_in) const int offset =
* inputHeight + h_in) * inputWidth + w_in; ((batch * inputChannels + c_in) * inputHeight + h_in) *
value += (*weight) * inputData[offset]; inputWidth +
} w_in;
++weight; value += (*weight) * inputData[offset];
} }
} ++weight;
}
}
} }
outputData[index] = value; outputData[index] = value;
} }
...@@ -78,16 +90,25 @@ void ConvolutionDepthwiseForward(const int nthreads, ...@@ -78,16 +90,25 @@ void ConvolutionDepthwiseForward(const int nthreads,
// CUDA kernel to compute the depthwise convolution backprop w.r.t input. // CUDA kernel to compute the depthwise convolution backprop w.r.t input.
template <class T> template <class T>
__global__ __global__ void ConvolutionDepthwiseInputBackward(const int nthreads,
void ConvolutionDepthwiseInputBackward(const int nthreads, const T* const top_diff,
const T* const top_diff, const T* const weight_data, const T* const weight_data,
const int num, const int outputChannels, const int outputHeight, const int num,
const int outputWidth, const int inputChannels, const int inputHeight, const int outputChannels,
const int inputWidth, const int filterMultiplier, const int filterHeight, const int outputHeight,
const int filterWidth, const int strideH, const int strideW, const int outputWidth,
const int paddingH, const int paddingW, T* const bottom_diff) { const int inputChannels,
int index = const int inputHeight,
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; const int inputWidth,
const int filterMultiplier,
const int filterHeight,
const int filterWidth,
const int strideH,
const int strideW,
const int paddingH,
const int paddingW,
T* const bottom_diff) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) { if (index < nthreads) {
const int batch = index / inputChannels / inputHeight / inputWidth; const int batch = index / inputChannels / inputHeight / inputWidth;
const int c_in = (index / inputHeight / inputWidth) % inputChannels; const int c_in = (index / inputHeight / inputWidth) % inputChannels;
...@@ -96,65 +117,80 @@ void ConvolutionDepthwiseInputBackward(const int nthreads, ...@@ -96,65 +117,80 @@ void ConvolutionDepthwiseInputBackward(const int nthreads,
const int c_out_start = c_in * filterMultiplier; const int c_out_start = c_in * filterMultiplier;
int h_out_start = (h_in - filterHeight + paddingH + strideH)/strideH; int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH;
h_out_start = 0 > h_out_start ? 0 : h_out_start; h_out_start = 0 > h_out_start ? 0 : h_out_start;
int h_out_end = (h_in + paddingH)/strideH; int h_out_end = (h_in + paddingH) / strideH;
h_out_end = outputHeight - 1 < h_out_end? outputHeight - 1 : h_out_end; h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end;
int w_out_start = (w_in - filterWidth + paddingW + strideW)/strideW; int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW;
w_out_start = 0 > w_out_start ? 0 : w_out_start; w_out_start = 0 > w_out_start ? 0 : w_out_start;
int w_out_end = (w_in + paddingW)/strideW; int w_out_end = (w_in + paddingW) / strideW;
w_out_end = outputWidth - 1 < w_out_end? outputWidth - 1 : w_out_end; w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end;
T value = 0; T value = 0;
for (int c_out = c_out_start; for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier;
c_out < c_out_start + filterMultiplier; c_out ++) { c_out++) {
for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
const int filter_h = h_in + paddingH - h_out * strideH; const int filter_h = h_in + paddingH - h_out * strideH;
for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
const int filter_w = w_in + paddingW - w_out * strideW; const int filter_w = w_in + paddingW - w_out * strideW;
const int filter_offset = c_out * filterHeight * filterWidth const int filter_offset = c_out * filterHeight * filterWidth +
+ filter_h * filterWidth + filter_w; filter_h * filterWidth + filter_w;
const int top_diff_offset = ((batch * outputChannels + c_out) * const int top_diff_offset =
outputHeight + h_out)* outputWidth + w_out; ((batch * outputChannels + c_out) * outputHeight + h_out) *
value += top_diff[top_diff_offset] * weight_data[filter_offset]; outputWidth +
} w_out;
value += top_diff[top_diff_offset] * weight_data[filter_offset];
} }
}
} }
bottom_diff[index] += value; bottom_diff[index] += value;
} }
} }
// CUDA kernel to compute the depthwise convolution backprop w.r.t filter. // CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
template <class T> template <class T>
__global__ __global__ void ConvolutionDepthwiseFilterBackward(const int num_i,
void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, const int nthreads,
const T* const top_diff, const T* const inputData, const T* const top_diff,
const int num, const int outputChannels, const int outputHeight, const T* const inputData,
const int outputWidth, const int inputChannels, const int inputHeight, const int num,
const int inputWidth, const int filterMultiplier, const int filterHeight, const int outputChannels,
const int filterWidth, const int strideH, const int strideW, const int outputHeight,
const int paddingH, const int paddingW, T* const buffer_data) { const int outputWidth,
int index = const int inputChannels,
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; const int inputHeight,
const int inputWidth,
const int filterMultiplier,
const int filterHeight,
const int filterWidth,
const int strideH,
const int strideW,
const int paddingH,
const int paddingW,
T* const buffer_data) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) { if (index < nthreads) {
const int h_out = (index / outputWidth) % outputHeight; const int h_out = (index / outputWidth) % outputHeight;
const int w_out = index % outputWidth; const int w_out = index % outputWidth;
const int kh = (index / filterWidth / outputHeight / outputWidth) const int kh =
% filterHeight; (index / filterWidth / outputHeight / outputWidth) % filterHeight;
const int kw = (index / outputHeight / outputWidth) % filterWidth; const int kw = (index / outputHeight / outputWidth) % filterWidth;
const int h_in = -paddingH + h_out * strideH + kh; const int h_in = -paddingH + h_out * strideH + kh;
const int w_in = -paddingW + w_out * strideW + kw; const int w_in = -paddingW + w_out * strideW + kw;
if ((h_in >= 0) && (h_in < inputHeight) if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
&& (w_in >= 0) && (w_in < inputWidth)) { (w_in < inputWidth)) {
const int c_out = index / const int c_out =
(filterHeight * filterWidth * outputHeight * outputWidth); index / (filterHeight * filterWidth * outputHeight * outputWidth);
const int c_in = c_out / filterMultiplier; const int c_in = c_out / filterMultiplier;
const int batch = num_i; const int batch = num_i;
const int top_offset = ((batch * outputChannels + c_out) * const int top_offset =
outputHeight + h_out) * outputWidth + w_out; ((batch * outputChannels + c_out) * outputHeight + h_out) *
const int bottom_offset = ((batch * inputChannels + c_in) outputWidth +
* inputHeight + h_in) * inputWidth + w_in; w_out;
const int bottom_offset =
((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth +
w_in;
buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset]; buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
} else { } else {
buffer_data[index] = 0; buffer_data[index] = 0;
...@@ -163,170 +199,169 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, ...@@ -163,170 +199,169 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
} }
template <class T> template <class T>
class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T>{ class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T> {
public: public:
void operator()(const T* inputData, void operator()(const T* inputData,
const T* filterData, const T* filterData,
int batchSize, int batchSize,
int outputChannels, int outputChannels,
int outputHeight, int outputHeight,
int outputWidth, int outputWidth,
int inputChannels, int inputChannels,
int inputHeight, int inputHeight,
int inputWidth, int inputWidth,
int filterMultiplier, int filterMultiplier,
int filterHeight, int filterHeight,
int filterWidth, int filterWidth,
int strideH, int strideH,
int strideW, int strideW,
int paddingH, int paddingH,
int paddingW, int paddingW,
T* outputData){ T* outputData) {
int outputSize = batchSize * outputChannels * outputHeight * outputWidth; int outputSize = batchSize * outputChannels * outputHeight * outputWidth;
size_t blocks = (outputSize + 1024 -1) / 1024; size_t blocks = (outputSize + 1024 - 1) / 1024;
size_t blockX = 512; size_t blockX = 512;
size_t blockY = (blocks+512-1)/512; size_t blockY = (blocks + 512 - 1) / 512;
dim3 threads(1024, 1); dim3 threads(1024, 1);
dim3 grid(blockX, blockY); dim3 grid(blockX, blockY);
ConvolutionDepthwiseForward<T> ConvolutionDepthwiseForward<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
<<< grid, threads, 0, STREAM_DEFAULT >>>( outputSize,
outputSize, inputData,
inputData, filterData,
filterData, batchSize,
batchSize, outputChannels,
outputChannels, outputHeight,
outputHeight, outputWidth,
outputWidth, inputChannels,
inputChannels, inputHeight,
inputHeight, inputWidth,
inputWidth, filterMultiplier,
filterMultiplier, filterHeight,
filterHeight, filterWidth,
filterWidth, strideH,
strideH, strideW,
strideW, paddingH,
paddingH, paddingW,
paddingW, outputData);
outputData); }
}
}; };
template <class T> template <class T>
class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T>{ class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T> {
public: public:
void operator()(const T* outputGrad, void operator()(const T* outputGrad,
const T* filterData, const T* filterData,
int batchSize, int batchSize,
int outputChannels, int outputChannels,
int outputHeight, int outputHeight,
int outputWidth, int outputWidth,
int inputChannels, int inputChannels,
int inputHeight, int inputHeight,
int inputWidth, int inputWidth,
int filterMultiplier, int filterMultiplier,
int filterHeight, int filterHeight,
int filterWidth, int filterWidth,
int strideH, int strideH,
int strideW, int strideW,
int paddingH, int paddingH,
int paddingW, int paddingW,
T* inputGrad){ T* inputGrad) {
int inputSize = batchSize * inputChannels * inputHeight * inputWidth; int inputSize = batchSize * inputChannels * inputHeight * inputWidth;
size_t blocks = (inputSize + 1024 -1) / 1024; size_t blocks = (inputSize + 1024 - 1) / 1024;
size_t blockX = 512; size_t blockX = 512;
size_t blockY = (blocks+512-1)/512; size_t blockY = (blocks + 512 - 1) / 512;
dim3 threads(1024, 1); dim3 threads(1024, 1);
dim3 grid(blockX, blockY); dim3 grid(blockX, blockY);
ConvolutionDepthwiseInputBackward<T> ConvolutionDepthwiseInputBackward<T>
// NOLINT_NEXT_LINE(whitespace/operators) // NOLINT_NEXT_LINE(whitespace/operators)
<<< grid, threads, 0, STREAM_DEFAULT >>>( <<<grid, threads, 0, STREAM_DEFAULT>>>(inputSize,
inputSize, outputGrad,
outputGrad, filterData,
filterData, batchSize,
batchSize, outputChannels,
outputChannels, outputHeight,
outputHeight, outputWidth,
outputWidth, inputChannels,
inputChannels, inputHeight,
inputHeight, inputWidth,
inputWidth, filterMultiplier,
filterMultiplier, filterHeight,
filterHeight, filterWidth,
filterWidth, strideH,
strideH, strideW,
strideW, paddingH,
paddingH, paddingW,
paddingW, inputGrad);
inputGrad); }
}
}; };
template <class T> template <class T>
class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, T> { class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, T> {
public: public:
void operator()(const T* outputGrad, void operator()(const T* outputGrad,
const T* inputData, const T* inputData,
int batchSize, int batchSize,
int outputChannels, int outputChannels,
int outputHeight, int outputHeight,
int outputWidth, int outputWidth,
int inputChannels, int inputChannels,
int inputHeight, int inputHeight,
int inputWidth, int inputWidth,
int filterMultiplier, int filterMultiplier,
int filterHeight, int filterHeight,
int filterWidth, int filterWidth,
int strideH, int strideH,
int strideW, int strideW,
int paddingH, int paddingH,
int paddingW, int paddingW,
T* colData, T* colData,
T* filterGrad){ T* filterGrad) {
int colDataSize = outputChannels * filterHeight * filterWidth int colDataSize = outputChannels * filterHeight * filterWidth *
* outputHeight * outputWidth; outputHeight * outputWidth;
size_t blocks = (colDataSize + 1024 -1) / 1024; size_t blocks = (colDataSize + 1024 - 1) / 1024;
size_t blockX = 512; size_t blockX = 512;
size_t blockY = (blocks+512-1)/512; size_t blockY = (blocks + 512 - 1) / 512;
dim3 threads(1024, 1); dim3 threads(1024, 1);
dim3 grid(blockX, blockY); dim3 grid(blockX, blockY);
BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
1, filterGrad, false, true); 1,
filterGrad,
false,
true);
for (int i = 0; i < batchSize; i++) { for (int i = 0; i < batchSize; i++) {
ConvolutionDepthwiseFilterBackward<T> ConvolutionDepthwiseFilterBackward<
<<< grid, threads, 0, STREAM_DEFAULT >>>( T><<<grid, threads, 0, STREAM_DEFAULT>>>(i,
i, colDataSize,
colDataSize, outputGrad,
outputGrad, inputData,
inputData, batchSize,
batchSize, outputChannels,
outputChannels, outputHeight,
outputHeight, outputWidth,
outputWidth, inputChannels,
inputChannels, inputHeight,
inputHeight, inputWidth,
inputWidth, filterMultiplier,
filterMultiplier, filterHeight,
filterHeight, filterWidth,
filterWidth, strideH,
strideH, strideW,
strideW, paddingH,
paddingH, paddingW,
paddingW, colData);
colData); int K = outputHeight * outputWidth;
int K = outputHeight * outputWidth; int M = colDataSize / K;
int M = colDataSize / K;
BaseMatrix colMatrix(M, K, colData, false, true); BaseMatrix colMatrix(M, K, colData, false, true);
filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0);
}
} }
}
}; };
#ifdef PADDLE_TYPE_DOUBLE #ifdef PADDLE_TYPE_DOUBLE
......
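Note: the depthwise kernels above recover (batch, channel, row, column) from a flat NCHW index before walking the filter window, and map each output channel to a single input channel via filterMultiplier. A plain C++ sketch of that index arithmetic, assuming the same NCHW layout (illustrative only, not Paddle code):

#include <cstdio>

// Decompose a flat NCHW output index the way the forward kernel does.
void decodeOutputIndex(int index, int outputChannels, int outputHeight,
                       int outputWidth, int filterMultiplier) {
  const int batch = index / outputChannels / outputHeight / outputWidth;
  const int c_out = (index / outputHeight / outputWidth) % outputChannels;
  const int h_out = (index / outputWidth) % outputHeight;
  const int w_out = index % outputWidth;
  // Depthwise: each output channel reads from exactly one input channel.
  const int c_in = c_out / filterMultiplier;
  std::printf("n=%d c_out=%d c_in=%d h=%d w=%d\n",
              batch, c_out, c_in, h_out, w_out);
}

int main() {
  decodeOutputIndex(/*index=*/12345, /*outputChannels=*/32,
                    /*outputHeight=*/28, /*outputWidth=*/28,
                    /*filterMultiplier=*/1);
  return 0;
}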
...@@ -24,14 +24,14 @@ void FunctionApi(typename Tensor<real, DType>::Matrix& output, ...@@ -24,14 +24,14 @@ void FunctionApi(typename Tensor<real, DType>::Matrix& output,
template <> template <>
void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) { void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) {
EXPECT_EQ(output.getHeight(), 100); EXPECT_EQ(output.getHeight(), 100U);
EXPECT_EQ(output.getWidth(), 200); EXPECT_EQ(output.getWidth(), 200U);
} }
template <> template <>
void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) { void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
EXPECT_EQ(output.getHeight(), 10); EXPECT_EQ(output.getHeight(), 10U);
EXPECT_EQ(output.getWidth(), 20); EXPECT_EQ(output.getWidth(), 20U);
} }
template <DeviceType DType> template <DeviceType DType>
...@@ -85,14 +85,14 @@ void testBufferArgs(const BufferArgs& inputs, ...@@ -85,14 +85,14 @@ void testBufferArgs(const BufferArgs& inputs,
} }
void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) { void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
EXPECT_EQ(inputs.size(), 1); EXPECT_EQ(inputs.size(), 1U);
check(inputs[0]); check(inputs[0]);
} }
TEST(Arguments, Matrix) { TEST(Arguments, Matrix) {
MatrixPtr matrix = Matrix::create(100, 200); MatrixPtr matrix = Matrix::create(100, 200);
CheckBufferArg check = [=](const BufferArg& arg) { CheckBufferArg check = [=](const BufferArg& arg) {
EXPECT_EQ(arg.shape().ndims(), 2); EXPECT_EQ(arg.shape().ndims(), 2U);
EXPECT_EQ(arg.shape()[0], 100); EXPECT_EQ(arg.shape()[0], 100);
EXPECT_EQ(arg.shape()[1], 200); EXPECT_EQ(arg.shape()[1], 200);
EXPECT_EQ(arg.data(), matrix->getData()); EXPECT_EQ(arg.data(), matrix->getData());
......
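Note: the test changes above (and the TensorShape/TensorType test changes below) only switch expected values to unsigned literals so both sides of EXPECT_EQ share a type, since the getters return size_t. A minimal standalone illustration (not Paddle code):

#include <gtest/gtest.h>
#include <cstddef>

TEST(UnsignedLiteral, MatchesSizeT) {
  std::size_t n = 100;
  // Comparing size_t against a plain int literal can trigger -Wsign-compare;
  // the U suffix makes both operands unsigned.
  EXPECT_EQ(n, 100U);
}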
...@@ -17,16 +17,21 @@ limitations under the License. */ ...@@ -17,16 +17,21 @@ limitations under the License. */
namespace paddle { namespace paddle {
template<class T> template <class T>
__global__ __global__ void im2col(const T* data_im,
void im2col(const T* data_im, int numOuts, int height, int width, int numOuts,
int blockH, int blockW, int height,
int strideH, int strideW, int width,
int paddingH, int paddingW, int blockH,
int height_col, int width_col, int blockW,
T* data_col) { int strideH,
int index = int strideW,
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; int paddingH,
int paddingW,
int height_col,
int width_col,
T* data_col) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < numOuts) { if (index < numOuts) {
int w_out = index % width_col; int w_out = index % width_col;
index /= width_col; index /= width_col;
...@@ -39,17 +44,17 @@ void im2col(const T* data_im, int numOuts, int height, int width, ...@@ -39,17 +44,17 @@ void im2col(const T* data_im, int numOuts, int height, int width,
data_col += (channel_out * height_col + h_out) * width_col + w_out; data_col += (channel_out * height_col + h_out) * width_col + w_out;
for (int i = 0; i < blockH; ++i) { for (int i = 0; i < blockH; ++i) {
for (int j = 0; j < blockW; ++j) { for (int j = 0; j < blockW; ++j) {
int rIdx = int(h_in+i); int rIdx = int(h_in + i);
int cIdx = int(w_in+j); int cIdx = int(w_in + j);
if ((rIdx-(int)paddingH) >= (int)height || if ((rIdx - (int)paddingH) >= (int)height ||
(rIdx-(int)paddingH) < 0 || (rIdx - (int)paddingH) < 0 ||
(cIdx-(int)paddingW) >= (int)width || (cIdx - (int)paddingW) >= (int)width ||
(cIdx-(int)paddingW) < 0) { (cIdx - (int)paddingW) < 0) {
*data_col = 0; *data_col = 0;
} else { } else {
rIdx = rIdx + channel_in*height - paddingH; rIdx = rIdx + channel_in * height - paddingH;
cIdx = cIdx - paddingW; cIdx = cIdx - paddingW;
*data_col = data_im[rIdx* width + cIdx]; *data_col = data_im[rIdx * width + cIdx];
} }
data_col += height_col * width_col; data_col += height_col * width_col;
} }
...@@ -82,60 +87,73 @@ public: ...@@ -82,60 +87,73 @@ public:
int outputWidth = colShape[4]; int outputWidth = colShape[4];
int numKernels = inputChannels * outputHeight * outputWidth; int numKernels = inputChannels * outputHeight * outputWidth;
int blocks = (numKernels + 1024 -1) / 1024; int blocks = (numKernels + 1024 - 1) / 1024;
int blockX = 512; int blockX = 512;
int blockY = (blocks + 512 - 1) / 512; int blockY = (blocks + 512 - 1) / 512;
dim3 threads(1024, 1); dim3 threads(1024, 1);
dim3 grid(blockX, blockY); dim3 grid(blockX, blockY);
im2col<T><<< grid, threads, 0, STREAM_DEFAULT >>> im2col<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
(imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth, numKernels,
strideHeight, strideWidth, paddingHeight, paddingWidth, inputHeight,
outputHeight, outputWidth, colData); inputWidth,
filterHeight,
filterWidth,
strideHeight,
strideWidth,
paddingHeight,
paddingWidth,
outputHeight,
outputWidth,
colData);
CHECK_SYNC("Im2ColFunctor GPU failed"); CHECK_SYNC("Im2ColFunctor GPU failed");
} }
}; };
template<class T> template <class T>
__global__ __global__ void col2im(size_t n,
void col2im(size_t n, const T* data_col, size_t height, const T* data_col,
size_t width, size_t channels, size_t height,
size_t blockH, size_t blockW, size_t width,
size_t strideH, size_t strideW, size_t channels,
size_t paddingH, size_t paddingW, size_t blockH,
size_t height_col, size_t width_col, size_t blockW,
T* data_im) { size_t strideH,
size_t strideW,
size_t paddingH,
size_t paddingW,
size_t height_col,
size_t width_col,
T* data_im) {
size_t index = size_t index =
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < n) { if (index < n) {
T val = 0; T val = 0;
int w = int(index % width); int w = int(index % width);
int h = int((index / width) % height); int h = int((index / width) % height);
int c = int(index / (width * height)); int c = int(index / (width * height));
if ((w - (int)paddingW) >= 0 && if ((w - (int)paddingW) >= 0 &&
(w - (int)paddingW) < (width-2 * paddingW) && (w - (int)paddingW) < (width - 2 * paddingW) &&
(h - (int)paddingH) >= 0 && (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) {
(h - paddingH) < (height - 2 * paddingH)) {
// compute the start and end of the output // compute the start and end of the output
int w_col_start = int w_col_start =
(w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
int w_col_end = int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
min((int)(w / (int)strideW + 1), (int)(width_col));
int h_col_start = int h_col_start =
(h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1; (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
int h_col_end = min(int(h / strideH + 1), int(height_col)); int h_col_end = min(int(h / strideH + 1), int(height_col));
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
// the col location: [c * width * height + h_out, w_out] // the col location: [c * width * height + h_out, w_out]
int c_col = int(c * blockH* blockW) + \ int c_col = int(c * blockH * blockW) +
(h - h_col * (int)strideH) * (int)blockW + (h - h_col * (int)strideH) * (int)blockW +
(w - w_col * (int)strideW); (w - w_col * (int)strideW);
val += data_col[(c_col * height_col + h_col) * width_col + w_col]; val += data_col[(c_col * height_col + h_col) * width_col + w_col];
} }
} }
h -= paddingH; h -= paddingH;
w -= paddingW; w -= paddingW;
data_im[c*((width-2*paddingW) * (height-2*paddingH)) + data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) +
h*(width-2*paddingW) + w] += val; h * (width - 2 * paddingW) + w] += val;
} }
} }
} }
...@@ -164,32 +182,32 @@ public: ...@@ -164,32 +182,32 @@ public:
int outputHeight = colShape[3]; int outputHeight = colShape[3];
int outputWidth = colShape[4]; int outputWidth = colShape[4];
size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight) size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) *
* (inputWidth + 2*paddingWidth); (inputWidth + 2 * paddingWidth);
size_t blocks = (numKernels + 1024 -1) / 1024; size_t blocks = (numKernels + 1024 - 1) / 1024;
size_t blockX = 512; size_t blockX = 512;
size_t blockY = (blocks+512-1)/512; size_t blockY = (blocks + 512 - 1) / 512;
dim3 threads(1024, 1); dim3 threads(1024, 1);
dim3 grid(blockX, blockY); dim3 grid(blockX, blockY);
// To avoid involving atomic operations, we will launch one kernel per // To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions. // bottom dimension, and then in the kernel add up the top dimensions.
col2im<T><<< grid, threads, 0, STREAM_DEFAULT >>> col2im<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
(numKernels, numKernels,
colData, colData,
inputHeight + 2*paddingHeight, inputHeight + 2 * paddingHeight,
inputWidth + 2*paddingWidth, inputWidth + 2 * paddingWidth,
inputChannels, inputChannels,
filterHeight, filterHeight,
filterWidth, filterWidth,
strideHeight, strideHeight,
strideWidth, strideWidth,
paddingHeight, paddingHeight,
paddingWidth, paddingWidth,
outputHeight, outputHeight,
outputWidth, outputWidth,
imData); imData);
CHECK_SYNC("Col2ImFunctor GPU failed"); CHECK_SYNC("Col2ImFunctor GPU failed");
} }
}; };
...@@ -199,31 +217,35 @@ template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, double>; ...@@ -199,31 +217,35 @@ template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, double>;
template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, float>; template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, float>;
template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, double>; template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, double>;
template<class T> template <class T>
__global__ __global__ void im2colOCF(const T* imData,
void im2colOCF(const T* imData, T* colData, T* colData,
int inputChannels, int inputChannels,
int inputHeight, int inputWidth, int inputHeight,
int filterHeight, int filterWidth, int inputWidth,
int strideHeight, int strideWidth, int filterHeight,
int paddingHeight, int paddingWidth, int filterWidth,
int outputHeight, int outputWidth) { int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth,
int outputHeight,
int outputWidth) {
int swId = blockIdx.x; int swId = blockIdx.x;
int shId = blockIdx.y; int shId = blockIdx.y;
for (int channelId = threadIdx.z; for (int channelId = threadIdx.z; channelId < inputChannels;
channelId < inputChannels;
channelId += blockDim.z) { channelId += blockDim.z) {
for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
int widthOffset = idx + swId * strideWidth - paddingWidth; int widthOffset = idx + swId * strideWidth - paddingWidth;
int heightOffset = idy + shId * strideHeight - paddingHeight; int heightOffset = idy + shId * strideHeight - paddingHeight;
int imOffset = widthOffset + heightOffset * inputWidth int imOffset = widthOffset + heightOffset * inputWidth +
+ channelId * inputHeight * inputWidth; channelId * inputHeight * inputWidth;
int colOffset = idx + idy * filterWidth int colOffset = idx + idy * filterWidth +
+ channelId * filterHeight * filterWidth channelId * filterHeight * filterWidth +
+ (shId * outputWidth + swId) (shId * outputWidth + swId) *
* (inputChannels * filterHeight * filterWidth); (inputChannels * filterHeight * filterWidth);
if (heightOffset >= inputHeight || heightOffset < 0 || if (heightOffset >= inputHeight || heightOffset < 0 ||
widthOffset >= inputWidth || widthOffset < 0) { widthOffset >= inputWidth || widthOffset < 0) {
...@@ -279,39 +301,52 @@ public: ...@@ -279,39 +301,52 @@ public:
int blockDimZ = 1024 / blockDimX / blockDimY; int blockDimZ = 1024 / blockDimX / blockDimY;
dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels)); dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
dim3 grid(outputWidth, outputHeight); dim3 grid(outputWidth, outputHeight);
im2colOCF<T><<< grid, threads, 0, STREAM_DEFAULT >>> im2colOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
(imData, colData, inputChannels, inputHeight, inputWidth, colData,
filterHeight, filterWidth, strideHeight, strideWidth, inputChannels,
paddingHeight, paddingWidth, outputHeight, outputWidth); inputHeight,
inputWidth,
filterHeight,
filterWidth,
strideHeight,
strideWidth,
paddingHeight,
paddingWidth,
outputHeight,
outputWidth);
CHECK_SYNC("Im2ColFunctor GPU failed"); CHECK_SYNC("Im2ColFunctor GPU failed");
} }
}; };
template<class T> template <class T>
__global__ __global__ void col2imOCF(T* imData,
void col2imOCF(T* imData, const T* colData, const T* colData,
int inputChannels, int inputChannels,
int inputHeight, int inputWidth, int inputHeight,
int filterHeight, int filterWidth, int inputWidth,
int strideHeight, int strideWidth, int filterHeight,
int paddingHeight, int paddingWidth, int filterWidth,
int outputHeight, int outputWidth) { int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth,
int outputHeight,
int outputWidth) {
int swId = blockIdx.x; int swId = blockIdx.x;
int shId = blockIdx.y; int shId = blockIdx.y;
for (int channelId = threadIdx.z; for (int channelId = threadIdx.z; channelId < inputChannels;
channelId < inputChannels;
channelId += blockDim.z) { channelId += blockDim.z) {
for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
int widthOffset = idx + swId * strideWidth - paddingWidth; int widthOffset = idx + swId * strideWidth - paddingWidth;
int heightOffset = idy + shId * strideHeight - paddingHeight; int heightOffset = idy + shId * strideHeight - paddingHeight;
int imOffset = widthOffset + heightOffset * inputWidth int imOffset = widthOffset + heightOffset * inputWidth +
+ channelId * inputHeight * inputWidth; channelId * inputHeight * inputWidth;
int colOffset = idx + idy * filterWidth int colOffset = idx + idy * filterWidth +
+ channelId * filterHeight * filterWidth channelId * filterHeight * filterWidth +
+ (shId * outputWidth + swId) (shId * outputWidth + swId) *
* (inputChannels * filterHeight * filterWidth); (inputChannels * filterHeight * filterWidth);
if (heightOffset >= 0 && heightOffset < inputHeight && if (heightOffset >= 0 && heightOffset < inputHeight &&
widthOffset >= 0 && widthOffset < inputWidth) { widthOffset >= 0 && widthOffset < inputWidth) {
...@@ -365,10 +400,19 @@ public: ...@@ -365,10 +400,19 @@ public:
int blockDimZ = 1024 / blockDimX / blockDimY; int blockDimZ = 1024 / blockDimX / blockDimY;
dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels)); dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
dim3 grid(outputWidth, outputHeight); dim3 grid(outputWidth, outputHeight);
col2imOCF<T><<< grid, threads, 0, STREAM_DEFAULT >>> col2imOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
(imData, colData, inputChannels, inputHeight, inputWidth, colData,
filterHeight, filterWidth, strideHeight, strideWidth, inputChannels,
paddingHeight, paddingWidth, outputHeight, outputWidth); inputHeight,
inputWidth,
filterHeight,
filterWidth,
strideHeight,
strideWidth,
paddingHeight,
paddingWidth,
outputHeight,
outputWidth);
CHECK_SYNC("Col2ImFunctor GPU failed"); CHECK_SYNC("Col2ImFunctor GPU failed");
} }
}; };
......
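Note: the im2col kernel above writes each input patch into a [channels * filterH * filterW, outputH * outputW] column matrix, and col2im accumulates it back. A CPU reference of the forward mapping for one image, zero padding and no dilation (illustrative sketch, not the Paddle functor):

#include <cstddef>
#include <vector>

std::vector<float> im2colCpu(const std::vector<float>& im, int channels,
                             int height, int width, int filterH, int filterW,
                             int strideH, int strideW, int padH, int padW) {
  const int outH = (height + 2 * padH - filterH) / strideH + 1;
  const int outW = (width + 2 * padW - filterW) / strideW + 1;
  std::vector<float> col(
      static_cast<std::size_t>(channels) * filterH * filterW * outH * outW, 0.f);
  for (int c = 0; c < channels; ++c)
    for (int kh = 0; kh < filterH; ++kh)
      for (int kw = 0; kw < filterW; ++kw)
        for (int oh = 0; oh < outH; ++oh)
          for (int ow = 0; ow < outW; ++ow) {
            const int ih = oh * strideH - padH + kh;  // source row in the image
            const int iw = ow * strideW - padW + kw;  // source column
            const std::size_t colIdx =
                (((static_cast<std::size_t>(c) * filterH + kh) * filterW + kw) *
                     outH + oh) * outW + ow;
            if (ih >= 0 && ih < height && iw >= 0 && iw < width)
              col[colIdx] =
                  im[(static_cast<std::size_t>(c) * height + ih) * width + iw];
          }
  return col;
}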
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_base.h"
#include "MulOp.h" #include "MulOp.h"
#include "hl_base.h"
#include "paddle/math/Matrix.h" #include "paddle/math/Matrix.h"
#include "paddle/math/SparseMatrix.h" #include "paddle/math/SparseMatrix.h"
......
...@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_base.h"
#include "PadOp.h" #include "PadOp.h"
#include "hl_base.h"
namespace paddle { namespace paddle {
__global__ void KePad(real* outputs, const real* inputs, __global__ void KePad(real* outputs,
int inC, int inH, int inW, const real* inputs,
int padc, int padh, int padw, int inC,
int outC, int outH, int outW, int nthreads) { int inH,
int inW,
int padc,
int padh,
int padw,
int outC,
int outH,
int outW,
int nthreads) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x; const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < nthreads) { if (idx < nthreads) {
const int w = idx % inW; const int w = idx % inW;
...@@ -50,16 +58,33 @@ void Pad<DEVICE_TYPE_GPU>(real* outputs, ...@@ -50,16 +58,33 @@ void Pad<DEVICE_TYPE_GPU>(real* outputs,
int outC = inC + cstart + cend; int outC = inC + cstart + cend;
int outH = inH + hstart + hend; int outH = inH + hstart + hend;
int outW = inW + wstart + wend; int outW = inW + wstart + wend;
KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>> KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
(outputs, inputs, inC, inH, inW, cstart, hstart, wstart, inputs,
outC, outH, outW, nth); inC,
inH,
inW,
cstart,
hstart,
wstart,
outC,
outH,
outW,
nth);
CHECK_SYNC("Pad"); CHECK_SYNC("Pad");
} }
__global__ void KePadDiff(real* inGrad, const real* outGrad, __global__ void KePadDiff(real* inGrad,
int inC, int inH, int inW, const real* outGrad,
int padc, int padh, int padw, int inC,
int outC, int outH, int outW, int nthreads) { int inH,
int inW,
int padc,
int padh,
int padw,
int outC,
int outH,
int outW,
int nthreads) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x; const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < nthreads) { if (idx < nthreads) {
const int w = idx % inW; const int w = idx % inW;
...@@ -89,9 +114,18 @@ void PadGrad<DEVICE_TYPE_GPU>(real* inGrad, ...@@ -89,9 +114,18 @@ void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
int outC = inC + cstart + cend; int outC = inC + cstart + cend;
int outH = inH + hstart + hend; int outH = inH + hstart + hend;
int outW = inW + wstart + wend; int outW = inW + wstart + wend;
KePadDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>> KePadDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
(inGrad, outGrad, inC, inH, inW, cstart, hstart, wstart, outGrad,
outC, outH, outW, nth); inC,
inH,
inW,
cstart,
hstart,
wstart,
outC,
outH,
outW,
nth);
CHECK_SYNC("PadGrad"); CHECK_SYNC("PadGrad");
} }
......
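Note: KePad copies every input element of an NCHW sample to its shifted location in the padded output, and KePadDiff reads the gradient back from the same location. The offset arithmetic, spelled out on the CPU (illustrative sketch; assumes the usual channel/row/column shift):

#include <cstddef>

// Flat offset of input element (c, h, w) inside the padded output, given the
// number of channels/rows/columns added in front (padc, padh, padw).
inline std::size_t paddedOffset(int c, int h, int w, int padc, int padh,
                                int padw, int outH, int outW) {
  return (static_cast<std::size_t>(c + padc) * outH + (h + padh)) * outW +
         (w + padw);
}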
...@@ -12,16 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,16 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_base.h"
#include "RowConvOp.h" #include "RowConvOp.h"
#include "hl_base.h"
namespace paddle { namespace paddle {
template<int BLOCK_H, int BLOCK_W> template <int BLOCK_H, int BLOCK_W>
__global__ void KeRowConv(real* y, const real* x, const real* w, __global__ void KeRowConv(real* y,
const int* starts, const int height, const int width, const real* x,
const int numSeq, const int context) { const real* w,
const int* starts,
const int height,
const int width,
const int numSeq,
const int context) {
const int tidx = threadIdx.x; const int tidx = threadIdx.x;
const int tidy = threadIdx.y; const int tidy = threadIdx.y;
const int blky = blockDim.y; const int blky = blockDim.y;
...@@ -30,7 +34,7 @@ __global__ void KeRowConv(real* y, const real* x, const real* w, ...@@ -30,7 +34,7 @@ __global__ void KeRowConv(real* y, const real* x, const real* w,
__shared__ real sw[BLOCK_H][BLOCK_W]; __shared__ real sw[BLOCK_H][BLOCK_W];
for (int i = tidy; i < context; i += blky) { for (int i = tidy; i < context; i += blky) {
sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
} }
__syncthreads(); __syncthreads();
...@@ -56,9 +60,14 @@ __global__ void KeRowConv(real* y, const real* x, const real* w, ...@@ -56,9 +60,14 @@ __global__ void KeRowConv(real* y, const real* x, const real* w,
} }
} }
__global__ void KeRowConv2(real* y, const real* x, const real* w, __global__ void KeRowConv2(real* y,
const int* starts, const int height, const int width, const real* x,
const int numSeq, const int context) { const real* w,
const int* starts,
const int height,
const int width,
const int numSeq,
const int context) {
const int tidx = threadIdx.x; const int tidx = threadIdx.x;
const int tidy = threadIdx.y; const int tidy = threadIdx.y;
const int blky = blockDim.y; const int blky = blockDim.y;
...@@ -84,8 +93,6 @@ __global__ void KeRowConv2(real* y, const real* x, const real* w, ...@@ -84,8 +93,6 @@ __global__ void KeRowConv2(real* y, const real* x, const real* w,
} }
} }
template <> template <>
void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out, void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
const GpuMatrix& in, const GpuMatrix& in,
...@@ -105,21 +112,24 @@ void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out, ...@@ -105,21 +112,24 @@ void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
dim3 dimGrid(DIVUP(width, dimBlock.x), 1); dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
if (contextLength <= 32) { if (contextLength <= 32) {
KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>> KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
(y, x, w, starts, height, width, numSeq, contextLength); y, x, w, starts, height, width, numSeq, contextLength);
} else { } else {
KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>> KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
(y, x, w, starts, height, width, numSeq, contextLength); y, x, w, starts, height, width, numSeq, contextLength);
} }
CHECK_SYNC("RowConv"); CHECK_SYNC("RowConv");
} }
template <int BLOCK_H, int BLOCK_W, int CONTEXT>
template<int BLOCK_H, int BLOCK_W, int CONTEXT> __global__ void KeRowConvBwWeight(real* dw,
__global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, const real* x,
const int* starts, const int height, const int width, const int numSeq, const real* dy,
const int context) { const int* starts,
const int height,
const int width,
const int numSeq,
const int context) {
const int tidx = threadIdx.x; const int tidx = threadIdx.x;
const int tidy = threadIdx.y; const int tidy = threadIdx.y;
const int blky = blockDim.y; const int blky = blockDim.y;
...@@ -138,21 +148,21 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, ...@@ -138,21 +148,21 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
const int start = starts[i]; const int start = starts[i];
const int end = starts[i + 1]; const int end = starts[i + 1];
const int steps = end - start; const int steps = end - start;
const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H; const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
for (int j = tidy; j < size; j += BLOCK_H) { for (int j = tidy; j < size; j += BLOCK_H) {
int xoff = gidx + tidx; int xoff = gidx + tidx;
int yoff = start + j; int yoff = start + j;
// transpose // transpose
sh_x[tidx][tidy] = (xoff < width && yoff < end) ? sh_x[tidx][tidy] =
x[yoff * width + xoff] : 0.0; (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? sh_dy[tidx][tidy + context - 1] =
dy[yoff * width + xoff] : 0.0; (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
__syncthreads(); __syncthreads();
if (tidy < (context - 1)) { if (tidy < (context - 1)) {
yoff = yoff - context + 1; yoff = yoff - context + 1;
sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? sh_dy[tidx][tidy] =
dy[yoff * width + xoff] : 0.0; (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
} }
__syncthreads(); __syncthreads();
...@@ -179,11 +189,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, ...@@ -179,11 +189,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
} }
} }
template<int BLOCK_H, int BLOCK_W> template <int BLOCK_H, int BLOCK_W>
__global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, __global__ void KeRowConvBwWeight2(real* dw,
const int* starts, const int height, const int width, const int numSeq, const real* x,
const int context) { const real* dy,
const int* starts,
const int height,
const int width,
const int numSeq,
const int context) {
const int tidx = threadIdx.x; const int tidx = threadIdx.x;
const int tidy = threadIdx.y; const int tidy = threadIdx.y;
const int gidx = blockIdx.x * blockDim.x; const int gidx = blockIdx.x * blockDim.x;
...@@ -196,19 +210,21 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, ...@@ -196,19 +210,21 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
const int end = starts[i + 1]; const int end = starts[i + 1];
const int steps = end - start; const int steps = end - start;
const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H; const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
for (int j = tidy; j < size; j += BLOCK_H) { for (int j = tidy; j < size; j += BLOCK_H) {
int xoff = gidx + tidx; int xoff = gidx + tidx;
int yoff = start + j; int yoff = start + j;
// transpose // transpose
sh_x[tidx][tidy] = (xoff < width && yoff < end) ? sh_x[tidx][tidy] =
x[yoff * width + xoff] : 0.0; (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
__syncthreads(); __syncthreads();
for (int t = 0; t < context; t++) { for (int t = 0; t < context; t++) {
sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && sh_dy[tidx][tidy] =
yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0; (xoff < width && (yoff - t) >= start && yoff - t < end)
? dy[(yoff - t) * width + xoff]
: 0.0;
__syncthreads(); __syncthreads();
real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx]; real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
...@@ -222,18 +238,22 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, ...@@ -222,18 +238,22 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
__syncthreads(); __syncthreads();
if (tidx == 0 && (gidx + tidy) < width) { if (tidx == 0 && (gidx + tidy) < width) {
dw[t*width + gidx + tidy] += val; dw[t * width + gidx + tidy] += val;
} }
} }
} }
} }
} }
template<int BLOCK_H, int BLOCK_W> template <int BLOCK_H, int BLOCK_W>
__global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, __global__ void KeRowConvBwData(real* dx,
const int* starts, const int height, const int width, const int numSeq, const real* w,
const int context) { const real* dy,
const int* starts,
const int height,
const int width,
const int numSeq,
const int context) {
const int tidx = threadIdx.x; const int tidx = threadIdx.x;
const int tidy = threadIdx.y; const int tidy = threadIdx.y;
const int blky = blockDim.y; const int blky = blockDim.y;
...@@ -242,7 +262,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, ...@@ -242,7 +262,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
__shared__ real sw[BLOCK_H][BLOCK_W]; __shared__ real sw[BLOCK_H][BLOCK_W];
for (int i = tidy; i < context; i += blky) { for (int i = tidy; i < context; i += blky) {
sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
} }
__syncthreads(); __syncthreads();
...@@ -266,10 +286,14 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, ...@@ -266,10 +286,14 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
} }
} }
__global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy, __global__ void KeRowConvBwData2(real* dx,
const int* starts, const int height, const int width, const int numSeq, const real* w,
const int context) { const real* dy,
const int* starts,
const int height,
const int width,
const int numSeq,
const int context) {
const int tidx = threadIdx.x; const int tidx = threadIdx.x;
const int tidy = threadIdx.y; const int tidy = threadIdx.y;
const int blky = blockDim.y; const int blky = blockDim.y;
...@@ -295,14 +319,13 @@ __global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy, ...@@ -295,14 +319,13 @@ __global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy,
} }
} }
template <> template <>
void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG, void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
const GpuMatrix& in, const GpuMatrix& in,
const GpuMatrix& filter, const GpuMatrix& filter,
GpuMatrix& inG, GpuMatrix& inG,
GpuMatrix& filterG, GpuMatrix& filterG,
const GpuIVector& seq) { const GpuIVector& seq) {
const size_t numSeq = seq.getSize() - 1; const size_t numSeq = seq.getSize() - 1;
const size_t contextLength = filter.getHeight(); const size_t contextLength = filter.getHeight();
const size_t height = in.getHeight(); const size_t height = in.getHeight();
...@@ -318,13 +341,11 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG, ...@@ -318,13 +341,11 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
dim3 dimGrid(DIVUP(width, dimBlock.x), 1); dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
real* dw = filterG.getData(); real* dw = filterG.getData();
if (contextLength <= 32) { if (contextLength <= 32) {
KeRowConvBwWeight<32, 32, 32> KeRowConvBwWeight<32, 32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>> dw, x, dy, starts, height, width, numSeq, contextLength);
(dw, x, dy, starts, height, width, numSeq, contextLength);
} else { } else {
KeRowConvBwWeight2<32, 32> KeRowConvBwWeight2<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>> dw, x, dy, starts, height, width, numSeq, contextLength);
(dw, x, dy, starts, height, width, numSeq, contextLength);
} }
} }
...@@ -333,13 +354,11 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG, ...@@ -333,13 +354,11 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
dim3 dimBlock2(32, 32); dim3 dimBlock2(32, 32);
dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1); dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
if (contextLength <= 64) { if (contextLength <= 64) {
KeRowConvBwData<32, 64> KeRowConvBwData<32, 64><<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>> dx, w, dy, starts, height, width, numSeq, contextLength);
(dx, w, dy, starts, height, width, numSeq, contextLength);
} else { } else {
KeRowConvBwData2 KeRowConvBwData2<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>> dx, w, dy, starts, height, width, numSeq, contextLength);
(dx, w, dy, starts, height, width, numSeq, contextLength);
} }
} }
......
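Note: KeRowConv implements the lookahead (row) convolution used in DeepSpeech2-style models: each timestep is a per-dimension weighted sum of itself and the next context-1 rows of the same sequence. A CPU reference of that definition for a single sequence (illustrative sketch):

#include <vector>

// out[t][d] = sum_{i=0}^{context-1} w[i][d] * in[t + i][d];
// rows past the end of the sequence contribute zero.
void rowConvCpu(const std::vector<std::vector<float>>& in,  // [steps][dim]
                const std::vector<std::vector<float>>& w,   // [context][dim]
                std::vector<std::vector<float>>& out) {
  const int steps = static_cast<int>(in.size());
  const int context = static_cast<int>(w.size());
  const int dim = steps > 0 ? static_cast<int>(in[0].size()) : 0;
  out.assign(steps, std::vector<float>(dim, 0.f));
  for (int t = 0; t < steps; ++t)
    for (int i = 0; i < context && t + i < steps; ++i)
      for (int d = 0; d < dim; ++d)
        out[t][d] += w[i][d] * in[t + i][d];
}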
...@@ -19,35 +19,35 @@ namespace paddle { ...@@ -19,35 +19,35 @@ namespace paddle {
TEST(TensorShape, Constructor) { TEST(TensorShape, Constructor) {
TensorShape t1; TensorShape t1;
EXPECT_EQ(t1.ndims(), 0); EXPECT_EQ(t1.ndims(), 0U);
EXPECT_EQ(t1.getElements(), 0); EXPECT_EQ(t1.getElements(), 0U);
TensorShape t2(3); TensorShape t2(3);
EXPECT_EQ(t2.ndims(), 3); EXPECT_EQ(t2.ndims(), 3U);
EXPECT_EQ(t2.getElements(), 1); EXPECT_EQ(t2.getElements(), 1U);
TensorShape t3({8, 10}); TensorShape t3({8, 10});
EXPECT_EQ(t3.ndims(), 2); EXPECT_EQ(t3.ndims(), 2U);
EXPECT_EQ(t3.getElements(), 80); EXPECT_EQ(t3.getElements(), 80U);
TensorShape t4(t3); TensorShape t4(t3);
EXPECT_EQ(t4.ndims(), t3.ndims()); EXPECT_EQ(t4.ndims(), t3.ndims());
EXPECT_EQ(t4.getElements(), t3.getElements()); EXPECT_EQ(t4.getElements(), t3.getElements());
TensorShape t5({1, 2, 3, 4, 5}); TensorShape t5({1, 2, 3, 4, 5});
EXPECT_EQ(t5.ndims(), 5); EXPECT_EQ(t5.ndims(), 5U);
EXPECT_EQ(t5.getElements(), 120); EXPECT_EQ(t5.getElements(), 120U);
} }
TEST(TensorShape, GetAndSet) { TEST(TensorShape, GetAndSet) {
TensorShape t({1, 2, 3}); TensorShape t({1, 2, 3});
EXPECT_EQ(t.ndims(), 3); EXPECT_EQ(t.ndims(), 3U);
EXPECT_EQ(t.getElements(), 6); EXPECT_EQ(t.getElements(), 6U);
EXPECT_EQ(t[1], 2); EXPECT_EQ(t[1], 2);
t.setDim(1, 100); t.setDim(1, 100);
EXPECT_EQ(t.getElements(), 300); EXPECT_EQ(t.getElements(), 300U);
EXPECT_EQ(t[1], 100); EXPECT_EQ(t[1], 100U);
} }
} // namespace paddle } // namespace paddle
...@@ -19,9 +19,9 @@ namespace paddle { ...@@ -19,9 +19,9 @@ namespace paddle {
TEST(TensorType, Matrix) { TEST(TensorType, Matrix) {
Tensor<real, DEVICE_TYPE_CPU>::Matrix matrix(100, 200); Tensor<real, DEVICE_TYPE_CPU>::Matrix matrix(100, 200);
EXPECT_EQ(matrix.getHeight(), 100); EXPECT_EQ(matrix.getHeight(), 100U);
EXPECT_EQ(matrix.getWidth(), 200); EXPECT_EQ(matrix.getWidth(), 200U);
EXPECT_EQ(matrix.getElementCnt(), 100 * 200); EXPECT_EQ(matrix.getElementCnt(), 100U * 200U);
EXPECT_EQ(matrix.useGpu(), false); EXPECT_EQ(matrix.useGpu(), false);
Tensor<real, DEVICE_TYPE_GPU>::Matrix testGpu(100, 200); Tensor<real, DEVICE_TYPE_GPU>::Matrix testGpu(100, 200);
...@@ -33,15 +33,15 @@ TEST(TensorType, Vector) { ...@@ -33,15 +33,15 @@ TEST(TensorType, Vector) {
Tensor<real, DEVICE_TYPE_GPU>::Vector gpuVector(100); Tensor<real, DEVICE_TYPE_GPU>::Vector gpuVector(100);
EXPECT_EQ(cpuVector.useGpu(), false); EXPECT_EQ(cpuVector.useGpu(), false);
EXPECT_EQ(gpuVector.useGpu(), true); EXPECT_EQ(gpuVector.useGpu(), true);
EXPECT_EQ(cpuVector.getSize(), 100); EXPECT_EQ(cpuVector.getSize(), 100U);
EXPECT_EQ(gpuVector.getSize(), 100); EXPECT_EQ(gpuVector.getSize(), 100U);
Tensor<int, DEVICE_TYPE_CPU>::Vector cpuIVector(100); Tensor<int, DEVICE_TYPE_CPU>::Vector cpuIVector(100);
Tensor<int, DEVICE_TYPE_GPU>::Vector gpuIVector(100); Tensor<int, DEVICE_TYPE_GPU>::Vector gpuIVector(100);
EXPECT_EQ(cpuIVector.useGpu(), false); EXPECT_EQ(cpuIVector.useGpu(), false);
EXPECT_EQ(gpuIVector.useGpu(), true); EXPECT_EQ(gpuIVector.useGpu(), true);
EXPECT_EQ(cpuIVector.getSize(), 100); EXPECT_EQ(cpuIVector.getSize(), 100U);
EXPECT_EQ(gpuIVector.getSize(), 100); EXPECT_EQ(gpuIVector.getSize(), 100U);
} }
TEST(TensorType, EmptyMatrix) { TEST(TensorType, EmptyMatrix) {
......
...@@ -49,9 +49,7 @@ class NNPACKConvFunction : public ConvFunctionBase { ...@@ -49,9 +49,7 @@ class NNPACKConvFunction : public ConvFunctionBase {
public: public:
void init(const FuncConfig& config) override { void init(const FuncConfig& config) override {
ConvFunctionBase::init(config); ConvFunctionBase::init(config);
CHECK_EQ(groups_, (size_t)1);
algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo")); algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo"));
// algorithm_ = nnp_convolution_algorithm_auto;
transform_strategy_ = nnp_convolution_transform_strategy_compute; transform_strategy_ = nnp_convolution_transform_strategy_compute;
nnp_status status = nnp_initialize(); nnp_status status = nnp_initialize();
CHECK_EQ(status, nnp_status_success); CHECK_EQ(status, nnp_status_success);
...@@ -67,8 +65,7 @@ public: ...@@ -67,8 +65,7 @@ public:
} }
} }
virtual void check(const BufferArgs& inputs, void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const BufferArgs& outputs) override {
const TensorShape& input = inputs[0].shape(); const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape(); const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape(); const TensorShape& output = outputs[0].shape();
...@@ -91,8 +88,8 @@ public: ...@@ -91,8 +88,8 @@ public:
size_t filterHeight = getFilterHeight(filter); size_t filterHeight = getFilterHeight(filter);
size_t filterWidth = getFilterWidth(filter); size_t filterWidth = getFilterWidth(filter);
size_t outputChannels = output[1]; size_t outputChannels = output[1];
// size_t outputHeight = output[2]; size_t outputHeight = output[2];
// size_t outputWidth = output[3]; size_t outputWidth = output[3];
nnp_size inputSize = {.width = inputWidth, .height = inputHeight}; nnp_size inputSize = {.width = inputWidth, .height = inputHeight};
nnp_padding padding = {.top = (size_t)paddingH(), nnp_padding padding = {.top = (size_t)paddingH(),
...@@ -171,49 +168,58 @@ public: ...@@ -171,49 +168,58 @@ public:
} }
} }
size_t inputOffset = inputChannels / groups_ * inputHeight * inputWidth;
size_t outputOffset = outputChannels / groups_ * outputHeight * outputWidth;
size_t filterOffset = filter.getElements() / groups_;
if (batchSize == 1) { if (batchSize == 1) {
nnp_status status = for (size_t g = 0; g < groups_; g++) {
nnp_convolution_inference(algorithm_, nnp_status status =
transform_strategy_, nnp_convolution_inference(algorithm_,
inputChannels, transform_strategy_,
outputChannels, inputChannels / groups_,
inputSize, outputChannels / groups_,
padding, inputSize,
kernelSize, padding,
outputSubsampling, kernelSize,
inputData, outputSubsampling,
filterData, inputData + inputOffset * g,
nullptr, /* bias */ filterData + filterOffset * g,
outputData, nullptr, /* bias */
bufferPtr, outputData + outputOffset * g,
sizePtr, bufferPtr,
nnp_activation_identity, sizePtr,
nullptr, nnp_activation_identity,
threadpool_, /* threadpool */ nullptr,
nullptr); threadpool_, /* threadpool */
CHECK_EQ(status, nnp_status_success); nullptr);
CHECK_EQ(status, nnp_status_success);
}
} else { } else {
// only supports stride = 1 for (size_t g = 0; g < groups_; g++) {
CHECK_EQ(strideH(), 1); // only supports stride = 1
CHECK_EQ(strideW(), 1); CHECK_EQ(strideH(), 1);
nnp_status status = nnp_convolution_output(algorithm_, CHECK_EQ(strideW(), 1);
batchSize, nnp_status status =
inputChannels, nnp_convolution_output(algorithm_,
outputChannels, batchSize,
inputSize, inputChannels / groups_,
padding, outputChannels / groups_,
kernelSize, inputSize,
inputData, padding,
filterData, kernelSize,
nullptr, /* bias */ inputData + inputOffset * g,
outputData, filterData + filterOffset * g,
bufferPtr, nullptr, /* bias */
sizePtr, outputData + outputOffset * g,
nnp_activation_identity, bufferPtr,
nullptr, sizePtr,
threadpool_, /* threadpool */ nnp_activation_identity,
nullptr); nullptr,
CHECK_EQ(status, nnp_status_success); threadpool_, /* threadpool */
nullptr);
CHECK_EQ(status, nnp_status_success);
}
} }
} }
......
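The grouped path above calls the NNPACK routine once per group, advancing the input, filter, and output pointers by fixed per-group offsets. A minimal sketch of that offset arithmetic, assuming NCHW layout with the filter partitioned evenly across groups (standalone and illustrative, not the actual NNPACK call):

#include <cstddef>

// Partition input/filter/output buffers across convolution groups (NCHW).
// For group g, each pointer is advanced by a fixed per-group stride.
void forEachGroup(std::size_t groups,
                  std::size_t inputChannels, std::size_t inputH, std::size_t inputW,
                  std::size_t outputChannels, std::size_t outputH, std::size_t outputW,
                  std::size_t filterElements,
                  const float* input, const float* filter, float* output) {
  const std::size_t inputOffset  = inputChannels / groups * inputH * inputW;
  const std::size_t outputOffset = outputChannels / groups * outputH * outputW;
  const std::size_t filterOffset = filterElements / groups;
  for (std::size_t g = 0; g < groups; ++g) {
    const float* in  = input  + inputOffset  * g;
    const float* flt = filter + filterOffset * g;
    float* out       = output + outputOffset * g;
    // ... run the per-group convolution on (in, flt, out) here ...
    (void)in; (void)flt; (void)out;
  }
}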
...@@ -186,7 +186,10 @@ Error __must_check forward(Argument& act) { ...@@ -186,7 +186,10 @@ Error __must_check forward(Argument& act) {
useGpu(act.deviceId)); useGpu(act.deviceId));
} }
auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId)); auto starts =
act.hasSubseq()
? act.subSequenceStartPositions->getVector(useGpu(act.deviceId))
: act.sequenceStartPositions->getVector(useGpu(act.deviceId));
act.value->sequenceSoftmax(*act.value, *starts); act.value->sequenceSoftmax(*act.value, *starts);
return Error(); return Error();
} }
...@@ -197,8 +200,9 @@ Error __must_check backward(Argument& act) { ...@@ -197,8 +200,9 @@ Error __must_check backward(Argument& act) {
"Input width for each timestep of sequence softmax should be 1"); "Input width for each timestep of sequence softmax should be 1");
} }
size_t numSequences = act.getNumSequences(); size_t numSequences =
const int* starts = act.sequenceStartPositions->getData(false); act.hasSubseq() ? act.getNumSubSequences() : act.getNumSequences();
const int* starts = act.getCpuStartPositions();
for (size_t i = 0; i < numSequences; ++i) { for (size_t i = 0; i < numSequences; ++i) {
// TODO(Dangqingqing) optimization for GPU // TODO(Dangqingqing) optimization for GPU
......
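The change above makes the sequence softmax normalize over the innermost level: when the argument carries nested sequences, the sub-sequence start positions are used as segment boundaries, otherwise the sequence start positions. A small sketch of applying a softmax independently per segment (hypothetical helper, not the Argument API):

#include <algorithm>
#include <cmath>
#include <vector>

// Apply softmax independently over each segment [starts[i], starts[i+1]).
// For nested input, `starts` should hold the sub-sequence boundaries so that
// normalization happens over the innermost segments.
void sequenceSoftmax(std::vector<float>& values, const std::vector<int>& starts) {
  for (std::size_t i = 0; i + 1 < starts.size(); ++i) {
    const int begin = starts[i], end = starts[i + 1];
    float maxVal = *std::max_element(values.begin() + begin, values.begin() + end);
    float sum = 0.f;
    for (int j = begin; j < end; ++j) {
      values[j] = std::exp(values[j] - maxVal);  // subtract max for stability
      sum += values[j];
    }
    for (int j = begin; j < end; ++j) values[j] /= sum;
  }
}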
...@@ -57,8 +57,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, ...@@ -57,8 +57,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
convGradFilterType = "GemmConvGradFilter"; convGradFilterType = "GemmConvGradFilter";
} }
if (FLAGS_use_nnpack) { if (FLAGS_use_nnpack && !isDeconv_) {
CHECK_EQ(isDeconv_, false);
createFunction(forward_, createFunction(forward_,
"NNPACKConv", "NNPACKConv",
FuncConfig() FuncConfig()
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "GruCompute.h" #include "GruCompute.h"
#include "hl_recurrent_apply.cuh" #include "hl_recurrent_apply.cuh"
...@@ -31,8 +30,10 @@ void GruCompute::forward<1>(hl_gru_value value, int frameSize, int batchSize) { ...@@ -31,8 +30,10 @@ void GruCompute::forward<1>(hl_gru_value value, int frameSize, int batchSize) {
} }
template <> template <>
void GruCompute::backward<1>(hl_gru_value value, hl_gru_grad grad, void GruCompute::backward<1>(hl_gru_value value,
int frameSize, int batchSize) { hl_gru_grad grad,
int frameSize,
int batchSize) {
hl_gpu_gru_backward(hppl::backward::gru_stateGrad(), hl_gpu_gru_backward(hppl::backward::gru_stateGrad(),
hppl::backward::gru_resetGrad(), hppl::backward::gru_resetGrad(),
value, value,
......
...@@ -12,41 +12,62 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,41 +12,62 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "LstmCompute.h" #include "LstmCompute.h"
#include "hl_recurrent_apply.cuh" #include "hl_recurrent_apply.cuh"
namespace paddle { namespace paddle {
template <> template <>
void LstmCompute::forwardBatch<1>(hl_lstm_value value, int frameSize, void LstmCompute::forwardBatch<1>(hl_lstm_value value,
int batchSize) { int frameSize,
hl_gpu_lstm_forward(hppl::forward::lstm(), value, frameSize, int batchSize) {
batchSize, activeNode_, activeGate_, hl_gpu_lstm_forward(hppl::forward::lstm(),
value,
frameSize,
batchSize,
activeNode_,
activeGate_,
activeState_); activeState_);
} }
template <> template <>
void LstmCompute::backwardBatch<1>(hl_lstm_value value, hl_lstm_grad grad, void LstmCompute::backwardBatch<1>(hl_lstm_value value,
int frameSize, int batchSize) { hl_lstm_grad grad,
hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad, int frameSize,
frameSize, batchSize, activeNode_, int batchSize) {
activeGate_, activeState_); hl_gpu_lstm_backward(hppl::backward::lstm(),
value,
grad,
frameSize,
batchSize,
activeNode_,
activeGate_,
activeState_);
} }
template <> template <>
void LstmCompute::forwardOneSequence<1>(hl_lstm_value value, int frameSize) { void LstmCompute::forwardOneSequence<1>(hl_lstm_value value, int frameSize) {
hl_gpu_lstm_forward(hppl::forward::lstm(), value, hl_gpu_lstm_forward(hppl::forward::lstm(),
frameSize, /* batchSize */ 1, value,
activeNode_, activeGate_, activeState_); frameSize,
/* batchSize */ 1,
activeNode_,
activeGate_,
activeState_);
} }
template <> template <>
void LstmCompute::backwardOneSequence<1>(hl_lstm_value value, hl_lstm_grad grad, void LstmCompute::backwardOneSequence<1>(hl_lstm_value value,
hl_lstm_grad grad,
int frameSize) { int frameSize) {
hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad, hl_gpu_lstm_backward(hppl::backward::lstm(),
frameSize, /* batchSize */ 1, value,
activeNode_, activeGate_, activeState_); grad,
frameSize,
/* batchSize */ 1,
activeNode_,
activeGate_,
activeState_);
} }
} // namespace paddle } // namespace paddle
...@@ -29,7 +29,7 @@ public: ...@@ -29,7 +29,7 @@ public:
vals.push_back(s.str()); vals.push_back(s.str());
} }
size_t pos = 0; size_t pos = 0;
int i = 0; size_t i = 0;
std::ostringstream s; std::ostringstream s;
const std::string& format = config_.user_arg(); const std::string& format = config_.user_arg();
while (true) { while (true) {
......
...@@ -50,7 +50,7 @@ add_unittest_without_exec(test_DetectionOutput ...@@ -50,7 +50,7 @@ add_unittest_without_exec(test_DetectionOutput
test_DetectionOutput.cpp test_DetectionOutput.cpp
LayerGradUtil.cpp) LayerGradUtil.cpp)
add_test(NAME test_DetectionOutput add_test(NAME test_DetectionOutput
COMMAND test_DetectionOutput) COMMAND test_DetectionOutput)
################# test_ConvUnify ####################### ################# test_ConvUnify #######################
add_unittest_without_exec(test_ConvUnify add_unittest_without_exec(test_ConvUnify
......
...@@ -57,6 +57,39 @@ TEST(Activation, activation) { ...@@ -57,6 +57,39 @@ TEST(Activation, activation) {
} }
} }
void testSequenceSoftmaxAct(bool hasSubseq) {
LOG(INFO) << "test activation: sequence softmax";
const size_t size = 1;
TestConfig config;
config.biasSize = 0;
config.layerConfig.set_type("addto");
config.layerConfig.set_size(size);
config.layerConfig.set_active_type("sequence_softmax");
config.inputDefs.push_back(
{hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
"layer_0",
1,
0});
config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) {
testLayerGrad(config,
"sequence_softmax",
100,
/* trans= */ false,
useGpu,
/* useWeight */ true);
}
}
TEST(SequenceSoftmaxActivation, activation) {
for (auto hasSubseq : {false, true}) {
LOG(INFO) << "hasSubseq = " << hasSubseq;
testSequenceSoftmaxAct(hasSubseq);
}
}
int main(int argc, char** argv) { int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
initMain(argc, argv); initMain(argc, argv);
......
...@@ -12,21 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,21 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cmath>
#include <string.h>
#include <paddle/utils/Logging.h> #include <paddle/utils/Logging.h>
#include <string.h>
#include <cmath>
#include "BaseMatrix.h" #include "BaseMatrix.h"
#include "hl_matrix_ops.cuh"
#include "hl_matrix_base.cuh"
#include "hl_matrix_apply.cuh"
#include "SIMDFunctions.h"
#include "MathFunctions.h" #include "MathFunctions.h"
#include "SIMDFunctions.h"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_base.cuh"
#include "hl_matrix_ops.cuh"
namespace paddle { namespace paddle {
const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported."; const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported.";
template<class T> template <class T>
template <class Op> template <class Op>
int BaseMatrixT<T>::applyUnary(Op op) { int BaseMatrixT<T>::applyUnary(Op op) {
MatrixOffset offset(0, 0); MatrixOffset offset(0, 0);
...@@ -34,9 +34,11 @@ int BaseMatrixT<T>::applyUnary(Op op) { ...@@ -34,9 +34,11 @@ int BaseMatrixT<T>::applyUnary(Op op) {
return 0; return 0;
} }
template<class T> template <class T>
template <class Op> template <class Op>
int BaseMatrixT<T>::applyUnary(Op op, int numRows, int numCols, int BaseMatrixT<T>::applyUnary(Op op,
int numRows,
int numCols,
MatrixOffset& offset) { MatrixOffset& offset) {
CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
int dimM = numRows; int dimM = numRows;
...@@ -56,7 +58,7 @@ int BaseMatrixT<T>::applyUnary(Op op, int numRows, int numCols, ...@@ -56,7 +58,7 @@ int BaseMatrixT<T>::applyUnary(Op op, int numRows, int numCols,
return 0; return 0;
} }
template<class T> template <class T>
template <class Op> template <class Op>
int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b) { int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b) {
CHECK(height_ == b.height_ && width_ == b.width_) CHECK(height_ == b.height_ && width_ == b.width_)
...@@ -67,18 +69,23 @@ int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b) { ...@@ -67,18 +69,23 @@ int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b) {
return 0; return 0;
} }
template<class T> template <class T>
template <class Op> template <class Op>
int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, int BaseMatrixT<T>::applyBinary(
MatrixOffset& offset) { Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) {
applyBinary(op, b, numRows, numCols, offset, false_type(), false_type()); applyBinary(op, b, numRows, numCols, offset, false_type(), false_type());
return 0; return 0;
} }
template<class T> template <class T>
template <class Op, class bAsRowVector, class bAsColVector> template <class Op, class bAsRowVector, class bAsColVector>
int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, int BaseMatrixT<T>::applyBinary(Op op,
MatrixOffset& offset, bAsRowVector, bAsColVector) { BaseMatrixT& b,
int numRows,
int numCols,
MatrixOffset& offset,
bAsRowVector,
bAsColVector) {
CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch"; CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch";
...@@ -91,8 +98,8 @@ int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, ...@@ -91,8 +98,8 @@ int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols,
T* A = data_; T* A = data_;
T* B = b.data_; T* B = b.data_;
CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, CAL_MATRIX_START_ADDRESS(
offset.bRow_); B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimM + offset.aRow_, this->height_);
CHECK_LE(dimN + offset.aCol_, this->width_); CHECK_LE(dimN + offset.aCol_, this->width_);
if (!bAsRowVector::value && !bAsColVector::value) { if (!bAsRowVector::value && !bAsColVector::value) {
...@@ -115,7 +122,7 @@ int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, ...@@ -115,7 +122,7 @@ int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols,
return 0; return 0;
} }
template<class T> template <class T>
template <class Op> template <class Op>
int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) { int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) {
CHECK_EQ(height_, b.height_); CHECK_EQ(height_, b.height_);
...@@ -129,21 +136,29 @@ int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) { ...@@ -129,21 +136,29 @@ int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) {
return 0; return 0;
} }
template<class T> template <class T>
template <class Op> template <class Op>
int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, int BaseMatrixT<T>::applyTernary(Op op,
int numRows, int numCols, BaseMatrixT& b,
BaseMatrixT& c,
int numRows,
int numCols,
MatrixOffset& offset) { MatrixOffset& offset) {
applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type()); applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type());
return 0; return 0;
} }
template<class T> template <class T>
template <class Op, class cAsRowVector, class cAsColVector> template <class Op, class cAsRowVector, class cAsColVector>
int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, int BaseMatrixT<T>::applyTernary(Op op,
int numRows, int numCols, MatrixOffset& offset, BaseMatrixT& b,
cAsRowVector, cAsColVector) { BaseMatrixT& c,
int numRows,
int numCols,
MatrixOffset& offset,
cAsRowVector,
cAsColVector) {
CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR;
...@@ -160,10 +175,10 @@ int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, ...@@ -160,10 +175,10 @@ int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c,
T* B = b.data_; T* B = b.data_;
T* C = c.data_; T* C = c.data_;
CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, CAL_MATRIX_START_ADDRESS(
offset.bRow_); B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, CAL_MATRIX_START_ADDRESS(
offset.cRow_); C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimM + offset.aRow_, this->height_);
CHECK_LE(dimN + offset.aCol_, this->width_); CHECK_LE(dimN + offset.aCol_, this->width_);
...@@ -180,21 +195,21 @@ int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, ...@@ -180,21 +195,21 @@ int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c,
} }
if (true == useGpu_) { if (true == useGpu_) {
hl_gpu_apply_ternary_op hl_gpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
<T, Op, cAsRowVector::value, cAsColVector::value>(
op, A, B, C, dimM, dimN, lda, ldb, ldc); op, A, B, C, dimM, dimN, lda, ldb, ldc);
} else { } else {
hl_cpu_apply_ternary_op hl_cpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
<T, Op, cAsRowVector::value, cAsColVector::value>(
op, A, B, C, dimM, dimN, lda, ldb, ldc); op, A, B, C, dimM, dimN, lda, ldb, ldc);
} }
return 0; return 0;
} }
template<class T> template <class T>
template <class Op> template <class Op>
int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, int BaseMatrixT<T>::applyQuaternary(Op op,
BaseMatrixT& b,
BaseMatrixT& c,
BaseMatrixT& d) { BaseMatrixT& d) {
CHECK_EQ(height_, b.height_); CHECK_EQ(height_, b.height_);
CHECK_EQ(width_, b.width_); CHECK_EQ(width_, b.width_);
...@@ -209,10 +224,14 @@ int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, ...@@ -209,10 +224,14 @@ int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
return 0; return 0;
} }
template<class T> template <class T>
template <class Op> template <class Op>
int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, int BaseMatrixT<T>::applyQuaternary(Op op,
BaseMatrixT& d, int numRows, int numCols, BaseMatrixT& b,
BaseMatrixT& c,
BaseMatrixT& d,
int numRows,
int numCols,
MatrixOffset& offset) { MatrixOffset& offset) {
CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
...@@ -234,12 +253,12 @@ int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, ...@@ -234,12 +253,12 @@ int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
T* C = c.data_; T* C = c.data_;
T* D = d.data_; T* D = d.data_;
CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, CAL_MATRIX_START_ADDRESS(
offset.bRow_); B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, CAL_MATRIX_START_ADDRESS(
offset.cRow_); C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
CAL_MATRIX_START_ADDRESS(D, d.height_, d.width_, ldd, offset.dCol_, CAL_MATRIX_START_ADDRESS(
offset.dRow_); D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_);
CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimM + offset.aRow_, this->height_);
CHECK_LE(dimN + offset.aCol_, this->width_); CHECK_LE(dimN + offset.aCol_, this->width_);
...@@ -250,22 +269,29 @@ int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, ...@@ -250,22 +269,29 @@ int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
CHECK_LE(dimM + offset.dRow_, d.height_); CHECK_LE(dimM + offset.dRow_, d.height_);
CHECK_LE(dimN + offset.dCol_, d.width_); CHECK_LE(dimN + offset.dCol_, d.width_);
if (true == useGpu_) { if (true == useGpu_) {
hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
ldc, ldd);
} else { } else {
hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
ldc, ldd);
} }
return 0; return 0;
} }
template<class T> template <class T>
template <class Agg, class Op, class Saver, class aAsRowVector, template <class Agg,
class Op,
class Saver,
class aAsRowVector,
class aAsColVector> class aAsColVector>
int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, int BaseMatrixT<T>::aggregate(Agg agg,
int numRows, int numCols, MatrixOffset& offset, Op op,
aAsRowVector, aAsColVector) { Saver sv,
BaseMatrixT& b,
int numRows,
int numCols,
MatrixOffset& offset,
aAsRowVector,
aAsColVector) {
CHECK_EQ(useGpu_, b.useGpu_); CHECK_EQ(useGpu_, b.useGpu_);
int ld = stride_; int ld = stride_;
...@@ -273,10 +299,10 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, ...@@ -273,10 +299,10 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
T* dst = data_; T* dst = data_;
T* B = b.data_; T* B = b.data_;
CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_, CAL_MATRIX_START_ADDRESS(
offset.aRow_); dst, height_, width_, ld, offset.aCol_, offset.aRow_);
CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, CAL_MATRIX_START_ADDRESS(
offset.bRow_); B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
if (aAsRowVector::value && !aAsColVector::value) { if (aAsRowVector::value && !aAsColVector::value) {
if (useGpu_) { if (useGpu_) {
...@@ -297,12 +323,21 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, ...@@ -297,12 +323,21 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
return 0; return 0;
} }
template<class T> template <class T>
template <class Agg, class Op, class Saver, class aAsRowVector, template <class Agg,
class Op,
class Saver,
class aAsRowVector,
class aAsColVector> class aAsColVector>
int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, int BaseMatrixT<T>::aggregate(Agg agg,
BaseMatrixT& c, int numRows, int numCols, Op op,
MatrixOffset& offset, aAsRowVector, Saver sv,
BaseMatrixT& b,
BaseMatrixT& c,
int numRows,
int numCols,
MatrixOffset& offset,
aAsRowVector,
aAsColVector) { aAsColVector) {
CHECK_EQ(useGpu_, b.useGpu_); CHECK_EQ(useGpu_, b.useGpu_);
CHECK_EQ(useGpu_, c.useGpu_); CHECK_EQ(useGpu_, c.useGpu_);
...@@ -314,28 +349,28 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, ...@@ -314,28 +349,28 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
T* dst = data_; T* dst = data_;
T* B = b.data_; T* B = b.data_;
T* C = c.data_; T* C = c.data_;
CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_, CAL_MATRIX_START_ADDRESS(
offset.aRow_); dst, height_, width_, ld, offset.aCol_, offset.aRow_);
CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, CAL_MATRIX_START_ADDRESS(
offset.bRow_); B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, CAL_MATRIX_START_ADDRESS(
offset.cRow_); C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
if (aAsRowVector::value && !aAsColVector::value) { if (aAsRowVector::value && !aAsColVector::value) {
if (useGpu_) { if (useGpu_) {
hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, hl_gpu_matrix_column_op(
ldb, C, ldc); agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
} else { } else {
hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, hl_cpu_matrix_column_op(
ldb, C, ldc); agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
} }
} else if (!aAsRowVector::value && aAsColVector::value) { } else if (!aAsRowVector::value && aAsColVector::value) {
if (useGpu_) { if (useGpu_) {
hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, hl_gpu_matrix_row_op(
ldb, C, ldc); agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
} else { } else {
hl_cpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, hl_cpu_matrix_row_op(
ldb, C, ldc); agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
} }
} else { } else {
LOG(FATAL) << "not supported"; LOG(FATAL) << "not supported";
...@@ -350,15 +385,19 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, ...@@ -350,15 +385,19 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
*/ */
DEFINE_MATRIX_UNARY_OP(Neg, a = -a); DEFINE_MATRIX_UNARY_OP(Neg, a = -a);
template<class T> template <class T>
void BaseMatrixT<T>::neg() { applyUnary(unary::Neg<T>()); } void BaseMatrixT<T>::neg() {
applyUnary(unary::Neg<T>());
}
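The DEFINE_MATRIX_*_OP macros used throughout this file generate small element-wise functors that applyUnary/applyBinary then run over every element, on CPU or GPU. A rough sketch of what such a macro plausibly expands to (an assumption for illustration; the real macros live in hl_matrix_ops.cuh):

// Hypothetical expansion of DEFINE_MATRIX_UNARY_OP(Neg, a = -a):
namespace unary {
template <class T>
class Neg {
public:
  // Invoked once per element; `a` is a reference into the matrix data.
  inline void operator()(T& a) const { a = -a; }
};
}  // namespace unary

// applyUnary(unary::Neg<T>()) then loops on CPU, or launches a kernel on GPU,
// calling op(data[i]) for every element of the matrix.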
DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a)); DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a));
template<> template <>
void BaseMatrixT<real>::exp2() { applyUnary(unary::Exp<real>()); } void BaseMatrixT<real>::exp2() {
applyUnary(unary::Exp<real>());
}
DEFINE_MATRIX_UNARY_OP(Log, a = log(a)); DEFINE_MATRIX_UNARY_OP(Log, a = log(a));
template<> template <>
void BaseMatrixT<real>::log2() { void BaseMatrixT<real>::log2() {
if (useGpu_) { if (useGpu_) {
applyUnary(unary::Log<real>()); applyUnary(unary::Log<real>());
...@@ -368,30 +407,42 @@ void BaseMatrixT<real>::log2() { ...@@ -368,30 +407,42 @@ void BaseMatrixT<real>::log2() {
} }
DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a)); DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a));
template<> template <>
void BaseMatrixT<real>::sqrt2() { applyUnary(unary::Sqrt<real>()); } void BaseMatrixT<real>::sqrt2() {
applyUnary(unary::Sqrt<real>());
}
DEFINE_MATRIX_UNARY_OP(Square, a = a * a); DEFINE_MATRIX_UNARY_OP(Square, a = a * a);
template<class T> template <class T>
void BaseMatrixT<T>::square2() { applyUnary(unary::Square<T>()); } void BaseMatrixT<T>::square2() {
applyUnary(unary::Square<T>());
}
DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a); DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a);
template<class T> template <class T>
void BaseMatrixT<T>::reciprocal2() { applyUnary(unary::Reciprocal<T>()); } void BaseMatrixT<T>::reciprocal2() {
applyUnary(unary::Reciprocal<T>());
}
DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a); DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a);
template<class T> template <class T>
void BaseMatrixT<T>::abs2() { applyUnary(unary::Abs<T>()); } void BaseMatrixT<T>::abs2() {
applyUnary(unary::Abs<T>());
}
DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0)); DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0));
template<class T> template <class T>
void BaseMatrixT<T>::sign2() { applyUnary(unary::Sign<T>()); } void BaseMatrixT<T>::sign2() {
applyUnary(unary::Sign<T>());
}
DEFINE_MATRIX_UNARY_OP(Zero, a = 0); DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
template<class T> template <class T>
void BaseMatrixT<T>::zero() { applyUnary(unary::Zero<T>()); } void BaseMatrixT<T>::zero() {
applyUnary(unary::Zero<T>());
}
template<class T> template <class T>
void BaseMatrixT<T>::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { void BaseMatrixT<T>::zeroAtOffset(int64_t columnOffset, int64_t numColumns) {
int numRows = height_; int numRows = height_;
int numCols = numColumns; int numCols = numColumns;
...@@ -400,11 +451,13 @@ void BaseMatrixT<T>::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { ...@@ -400,11 +451,13 @@ void BaseMatrixT<T>::zeroAtOffset(int64_t columnOffset, int64_t numColumns) {
} }
DEFINE_MATRIX_UNARY_OP(One, a = 1); DEFINE_MATRIX_UNARY_OP(One, a = 1);
template<class T> template <class T>
void BaseMatrixT<T>::one() { applyUnary(unary::One<T>()); } void BaseMatrixT<T>::one() {
applyUnary(unary::One<T>());
}
DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p)); DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p));
template<> template <>
void BaseMatrixT<real>::pow2(real p) { void BaseMatrixT<real>::pow2(real p) {
if (useGpu_) { if (useGpu_) {
applyUnary(unary::Pow<real>(p)); applyUnary(unary::Pow<real>(p));
...@@ -414,51 +467,67 @@ void BaseMatrixT<real>::pow2(real p) { ...@@ -414,51 +467,67 @@ void BaseMatrixT<real>::pow2(real p) {
} }
DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p); DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p);
template<class T> template <class T>
void BaseMatrixT<T>::subScalar(T p) { applyUnary(unary::SubScalar<T>(p)); } void BaseMatrixT<T>::subScalar(T p) {
applyUnary(unary::SubScalar<T>(p));
}
DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p); DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p);
template<class T> template <class T>
void BaseMatrixT<T>::mulScalar(T p) { applyUnary(unary::MulScalar<T>(p)); } void BaseMatrixT<T>::mulScalar(T p) {
applyUnary(unary::MulScalar<T>(p));
}
DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p); DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p);
template<class T> template <class T>
void BaseMatrixT<T>::divScalar(T p) { applyUnary(unary::DivScalar<T>(p)); } void BaseMatrixT<T>::divScalar(T p) {
applyUnary(unary::DivScalar<T>(p));
}
DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p); DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p);
template<class T> template <class T>
void BaseMatrixT<T>::assign(T p) { applyUnary(unary::Assign<T>(p)); } void BaseMatrixT<T>::assign(T p) {
applyUnary(unary::Assign<T>(p));
}
DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p); DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p);
template<class T> template <class T>
void BaseMatrixT<T>::add(T p) { applyUnary(unary::Add<T>(p)); } void BaseMatrixT<T>::add(T p) {
applyUnary(unary::Add<T>(p));
}
DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2); DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2);
template<class T> template <class T>
void BaseMatrixT<T>::add(T p1, T p2) { applyUnary(unary::Add2<T>(p1, p2)); } void BaseMatrixT<T>::add(T p1, T p2) {
applyUnary(unary::Add2<T>(p1, p2));
}
DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER, DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip,
TWO_PARAMETER,
a = a < p1 ? p1 : (a > p2 ? p2 : a)); a = a < p1 ? p1 : (a > p2 ? p2 : a));
template<class T> template <class T>
void BaseMatrixT<T>::clip(T p1, T p2) { applyUnary(unary::Clip<T>(p1, p2)); } void BaseMatrixT<T>::clip(T p1, T p2) {
applyUnary(unary::Clip<T>(p1, p2));
}
DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, TWO_PARAMETER, DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative,
a = b < p1 ? 0 : (b > p2 ? 0 : 1)); TWO_PARAMETER,
template<class T> a = b < p1 ? 0 : (b > p2 ? 0 : 1));
template <class T>
void BaseMatrixT<T>::clipDerivative(BaseMatrixT& b, T p1, T p2) { void BaseMatrixT<T>::clipDerivative(BaseMatrixT& b, T p1, T p2) {
applyBinary(binary::ClipDerivative<T>(p1, p2), b); applyBinary(binary::ClipDerivative<T>(p1, p2), b);
} }
DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, ONE_PARAMETER, DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar,
ONE_PARAMETER,
a = a > p ? 1.0f : 0.0f); a = a > p ? 1.0f : 0.0f);
template<class T> template <class T>
void BaseMatrixT<T>::biggerThanScalar(T p) { void BaseMatrixT<T>::biggerThanScalar(T p) {
applyUnary(unary::BiggerThanScalar<T>(p)); applyUnary(unary::BiggerThanScalar<T>(p));
} }
DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? a : p);
a = a > p ? a : p); template <class T>
template<class T>
void BaseMatrixT<T>::downClip(T p) { void BaseMatrixT<T>::downClip(T p) {
applyUnary(unary::DownClip<T>(p)); applyUnary(unary::DownClip<T>(p));
} }
...@@ -469,12 +538,12 @@ void BaseMatrixT<T>::downClip(T p) { ...@@ -469,12 +538,12 @@ void BaseMatrixT<T>::downClip(T p) {
*/ */
DEFINE_MATRIX_BINARY_OP(Add, a += b); DEFINE_MATRIX_BINARY_OP(Add, a += b);
template<class T> template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b) { void BaseMatrixT<T>::add(BaseMatrixT& b) {
applyBinary(binary::Add<T>(), b); applyBinary(binary::Add<T>(), b);
} }
template<> template <>
void BaseMatrixT<real>::add(BaseMatrixT& b) { void BaseMatrixT<real>::add(BaseMatrixT& b) {
if (useGpu_) { if (useGpu_) {
applyBinary(binary::Add<real>(), b); applyBinary(binary::Add<real>(), b);
...@@ -485,7 +554,7 @@ void BaseMatrixT<real>::add(BaseMatrixT& b) { ...@@ -485,7 +554,7 @@ void BaseMatrixT<real>::add(BaseMatrixT& b) {
} }
} }
template<class T> template <class T>
void BaseMatrixT<T>::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { void BaseMatrixT<T>::addAtOffset(BaseMatrixT& b, int64_t columnOffset) {
if (columnOffset + b.width_ <= width_) { if (columnOffset + b.width_ <= width_) {
int numRows = height_; int numRows = height_;
...@@ -504,43 +573,53 @@ void BaseMatrixT<T>::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { ...@@ -504,43 +573,53 @@ void BaseMatrixT<T>::addAtOffset(BaseMatrixT& b, int64_t columnOffset) {
} }
} }
template<class T> template <class T>
void BaseMatrixT<T>::addP2P(BaseMatrixT& b) { void BaseMatrixT<T>::addP2P(BaseMatrixT& b) {
T* A = data_; T* A = data_;
T* B = b.data_; T* B = b.data_;
int dimM = height_; int dimM = height_;
int dimN = width_; int dimN = width_;
hl_gpu_apply_binary_op<T, binary::Add<T>, 0, 0> hl_gpu_apply_binary_op<T, binary::Add<T>, 0, 0>(
(binary::Add<T>(), A, B, dimM, dimN, dimN, dimN); binary::Add<T>(), A, B, dimM, dimN, dimN, dimN);
} }
template<class T> template <class T>
void BaseMatrixT<T>::addColVector(BaseMatrixT& b) { void BaseMatrixT<T>::addColVector(BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0); MatrixOffset offset(0, 0, 0, 0);
int numRows = height_; int numRows = height_;
int numCols = width_; int numCols = width_;
applyBinary(binary::Add<T>(), b, numRows, numCols, offset, false_type(), applyBinary(binary::Add<T>(),
b,
numRows,
numCols,
offset,
false_type(),
true_type() /* bAsColVector */); true_type() /* bAsColVector */);
} }
template<class T> template <class T>
void BaseMatrixT<T>::addRowVector(BaseMatrixT& b) { void BaseMatrixT<T>::addRowVector(BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0); MatrixOffset offset(0, 0, 0, 0);
int numRows = height_; int numRows = height_;
int numCols = width_; int numCols = width_;
applyBinary(binary::Add<T>(), b, numRows, numCols, offset, applyBinary(binary::Add<T>(),
true_type() /* bAsRowVector */, false_type()); b,
numRows,
numCols,
offset,
true_type() /* bAsRowVector */,
false_type());
} }
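addColVector/addRowVector above select the broadcast behaviour at compile time by passing true_type/false_type tags into applyBinary. A minimal sketch of this tag-dispatch pattern (standalone, illustrative names):

#include <cstddef>
#include <type_traits>
#include <vector>

// Add `b` into an M x N matrix `a`, broadcasting b either as a row vector
// (one value per column) or as a column vector (one value per row).
// The bool tags are resolved at compile time, so each variant gets its own
// specialized loop with no runtime branch per element.
template <bool AsRowVector, bool AsColVector>
void addBroadcast(std::vector<float>& a, const std::vector<float>& b,
                  std::size_t M, std::size_t N,
                  std::integral_constant<bool, AsRowVector>,
                  std::integral_constant<bool, AsColVector>) {
  for (std::size_t i = 0; i < M; ++i)
    for (std::size_t j = 0; j < N; ++j)
      a[i * N + j] += AsRowVector ? b[j] : (AsColVector ? b[i] : b[i * N + j]);
}

// Call sites mirror the diff: the tag arguments document the broadcast mode.
// addBroadcast(a, rowVec, M, N, std::true_type{}, std::false_type{});
// addBroadcast(a, colVec, M, N, std::false_type{}, std::true_type{});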
DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p); DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p);
template<class T> template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b, T p) { void BaseMatrixT<T>::add(BaseMatrixT& b, T p) {
applyBinary(binary::Add1<T>(p), b); applyBinary(binary::Add1<T>(p), b);
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p)); DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p));
template<> template <>
void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) { void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
if (useGpu_) { if (useGpu_) {
applyBinary(binary::Pow<real>(p), b); applyBinary(binary::Pow<real>(p), b);
...@@ -550,36 +629,45 @@ void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) { ...@@ -550,36 +629,45 @@ void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b); DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b);
template<class T> template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, T p2) { void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, T p2) {
applyBinary(binary::Add2<T>(p1, p2), b); applyBinary(binary::Add2<T>(p1, p2), b);
} }
template<class T> template <class T>
void BaseMatrixT<T>::addBias(BaseMatrixT& b, T scale) { void BaseMatrixT<T>::addBias(BaseMatrixT& b, T scale) {
MatrixOffset offset(0, 0, 0, 0); MatrixOffset offset(0, 0, 0, 0);
int numRows = height_; int numRows = height_;
int numCols = width_; int numCols = width_;
applyBinary(binary::Add1<T>(scale), b, numRows, numCols, offset, applyBinary(binary::Add1<T>(scale),
true_type() /* bAsRowVector */, false_type()); b,
numRows,
numCols,
offset,
true_type() /* bAsRowVector */,
false_type());
} }
DEFINE_MATRIX_BINARY_OP(Sub, a -= b); DEFINE_MATRIX_BINARY_OP(Sub, a -= b);
template<class T> template <class T>
void BaseMatrixT<T>::sub(BaseMatrixT& b) { applyBinary(binary::Sub<T>(), b); } void BaseMatrixT<T>::sub(BaseMatrixT& b) {
applyBinary(binary::Sub<T>(), b);
}
DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p); DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p);
template<class T> template <class T>
void BaseMatrixT<T>::sub(BaseMatrixT& b, T p) { void BaseMatrixT<T>::sub(BaseMatrixT& b, T p) {
applyBinary(binary::Sub1<T>(p), b); applyBinary(binary::Sub1<T>(p), b);
} }
DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f); DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f);
template<class T> template <class T>
void BaseMatrixT<T>::relu(BaseMatrixT& b) { applyBinary(binary::Relu<T>(), b); } void BaseMatrixT<T>::relu(BaseMatrixT& b) {
applyBinary(binary::Relu<T>(), b);
}
DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f)); DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f));
template<class T> template <class T>
void BaseMatrixT<T>::reluDerivative(BaseMatrixT& b) { void BaseMatrixT<T>::reluDerivative(BaseMatrixT& b) {
applyBinary(binary::ReluDerivative<T>(), b); applyBinary(binary::ReluDerivative<T>(), b);
} }
...@@ -589,7 +677,7 @@ DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0; ...@@ -589,7 +677,7 @@ DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0;
? THRESHOLD ? THRESHOLD
: ((a < -THRESHOLD) ? (-THRESHOLD) : ((a < -THRESHOLD) ? (-THRESHOLD)
: a)))); : a))));
template<> template <>
void BaseMatrixT<real>::softrelu(BaseMatrixT& b) { void BaseMatrixT<real>::softrelu(BaseMatrixT& b) {
applyBinary(binary::Softrelu<real>(), b); applyBinary(binary::Softrelu<real>(), b);
} }
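The Softrelu op above clamps its argument to +/-THRESHOLD (40) before exponentiating so that exp() cannot overflow in single precision. A scalar sketch of the same guard:

#include <algorithm>
#include <cmath>

// softplus/softrelu with the exponent clamped to a safe range, as in the
// Softrelu op above (THRESHOLD = 40 keeps exp() finite in float).
float softrelu(float a) {
  const float kThreshold = 40.0f;
  float x = std::min(std::max(a, -kThreshold), kThreshold);
  return std::log(1.0f + std::exp(x));
}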
...@@ -599,97 +687,100 @@ DEFINE_MATRIX_BINARY_OP( ...@@ -599,97 +687,100 @@ DEFINE_MATRIX_BINARY_OP(
a *= (1.0 - exp(-1.0 * ((b > THRESHOLD) a *= (1.0 - exp(-1.0 * ((b > THRESHOLD)
? THRESHOLD ? THRESHOLD
: ((b < -THRESHOLD) ? (-THRESHOLD) : b))))); : ((b < -THRESHOLD) ? (-THRESHOLD) : b)))));
template<> template <>
void BaseMatrixT<real>::softreluDerivative(BaseMatrixT& b) { void BaseMatrixT<real>::softreluDerivative(BaseMatrixT& b) {
applyBinary(binary::SoftreluDerivative<real>(), b); applyBinary(binary::SoftreluDerivative<real>(), b);
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1; DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1;
b = b < p2 ? b : p2); b = b < p2 ? b : p2);
template<class T> template <class T>
void BaseMatrixT<T>::brelu(BaseMatrixT& b) { void BaseMatrixT<T>::brelu(BaseMatrixT& b) {
int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable.
applyBinary(binary::Brelu<T>(p1, p2), b); applyBinary(binary::Brelu<T>(p1, p2), b);
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, TWO_PARAMETER, DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative,
TWO_PARAMETER,
a *= (b > p1 && b < p2) ? 1.0 : 0.0); a *= (b > p1 && b < p2) ? 1.0 : 0.0);
template<class T> template <class T>
void BaseMatrixT<T>::breluDerivative(BaseMatrixT& b) { void BaseMatrixT<T>::breluDerivative(BaseMatrixT& b) {
int p1 = 0, p2 = 24; int p1 = 0, p2 = 24;
applyBinary(binary::BreluDerivative<T>(p1, p2), b); applyBinary(binary::BreluDerivative<T>(p1, p2), b);
} }
DEFINE_MATRIX_BINARY_OP(Square, b = a * a); DEFINE_MATRIX_BINARY_OP(Square, b = a * a);
template<class T> template <class T>
void BaseMatrixT<T>::square2(BaseMatrixT& b) { void BaseMatrixT<T>::square2(BaseMatrixT& b) {
applyBinary(binary::Square<T>(), b); applyBinary(binary::Square<T>(), b);
} }
DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b); DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b);
template<class T> template <class T>
void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) { void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
applyBinary(binary::SquareDerivative<T>(), b); applyBinary(binary::SquareDerivative<T>(), b);
} }
DEFINE_MATRIX_BINARY_OP(Tanh, DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a;
T tmp = -2.0 * a; tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); template <>
template<>
void BaseMatrixT<real>::tanh(BaseMatrixT& b) { void BaseMatrixT<real>::tanh(BaseMatrixT& b) {
applyBinary(binary::Tanh<real>(), b); applyBinary(binary::Tanh<real>(), b);
} }
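The Tanh op above computes tanh through the logistic identity tanh(a) = 2 / (1 + exp(-2a)) - 1, clamping the exponent so exp() stays finite for very negative inputs. A scalar sketch:

#include <cmath>

// tanh via the sigmoid identity used in binary::Tanh above, with the exponent
// capped at EXP_MAX_INPUT (value assumed here for illustration).
float tanhViaSigmoid(float a, float expMaxInput = 40.0f) {
  float tmp = -2.0f * a;
  if (tmp > expMaxInput) tmp = expMaxInput;
  return 2.0f / (1.0f + std::exp(tmp)) - 1.0f;
}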
DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b); DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b);
template<class T> template <class T>
void BaseMatrixT<T>::tanhDerivative(BaseMatrixT& b) { void BaseMatrixT<T>::tanhDerivative(BaseMatrixT& b) {
applyBinary(binary::TanhDerivative<T>(), b); applyBinary(binary::TanhDerivative<T>(), b);
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanh, TWO_PARAMETER, DEFINE_MATRIX_BINARY_PARAMETER_OP(
b = p1 * ScaledTanh, TWO_PARAMETER, b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0));
(2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); template <>
template<>
void BaseMatrixT<real>::scaledTanh(BaseMatrixT& b, real p1, real p2) { void BaseMatrixT<real>::scaledTanh(BaseMatrixT& b, real p1, real p2) {
applyBinary(binary::ScaledTanh<real>(p1, p2), b); applyBinary(binary::ScaledTanh<real>(p1, p2), b);
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, TWO_PARAMETER, DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative,
TWO_PARAMETER,
a *= p2 * (p1 - b * b)); a *= p2 * (p1 - b * b));
template<class T> template <class T>
void BaseMatrixT<T>::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) { void BaseMatrixT<T>::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) {
applyBinary(binary::ScaledTanhDerivative<T>(p1 * p1, p2 / p1), b); applyBinary(binary::ScaledTanhDerivative<T>(p1 * p1, p2 / p1), b);
} }
DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a); DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a);
template<class T> template <class T>
void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b) { void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b) {
applyBinary(binary::Reciprocal<T>(), b); applyBinary(binary::Reciprocal<T>(), b);
} }
DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b); DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b);
template<class T> template <class T>
void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) { void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) {
applyBinary(binary::ReciprocalDerivative<T>(), b); applyBinary(binary::ReciprocalDerivative<T>(), b);
} }
DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a); DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a);
template<class T> template <class T>
void BaseMatrixT<T>::abs2(BaseMatrixT& b) { applyBinary(binary::Abs<T>(), b); } void BaseMatrixT<T>::abs2(BaseMatrixT& b) {
applyBinary(binary::Abs<T>(), b);
}
DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0); DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0);
template<class T> template <class T>
void BaseMatrixT<T>::absDerivative(BaseMatrixT& b) { void BaseMatrixT<T>::absDerivative(BaseMatrixT& b) {
applyBinary(binary::AbsDerivative<T>(), b); applyBinary(binary::AbsDerivative<T>(), b);
} }
DEFINE_MATRIX_BINARY_OP( DEFINE_MATRIX_BINARY_OP(Sigmoid, const T THRESHOLD_MIN = -40.0;
Sigmoid, const T THRESHOLD_MIN = -40.0; const T THRESHOLD_MAX = 13.0; const T THRESHOLD_MAX = 13.0;
T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN T tmp = (a < THRESHOLD_MIN)
: ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a); ? THRESHOLD_MIN
b = 1.0f / (1.0f + exp(-tmp))); : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
template<> b = 1.0f / (1.0f + exp(-tmp)));
template <>
void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) { void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
if (useGpu_) { if (useGpu_) {
applyBinary(binary::Sigmoid<real>(), b); applyBinary(binary::Sigmoid<real>(), b);
...@@ -723,31 +814,31 @@ void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) { ...@@ -723,31 +814,31 @@ void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
} }
DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b)); DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b));
template<class T> template <class T>
void BaseMatrixT<T>::sigmoidDerivative(BaseMatrixT& b) { void BaseMatrixT<T>::sigmoidDerivative(BaseMatrixT& b) {
applyBinary(binary::SigmoidDerivative<T>(), b); applyBinary(binary::SigmoidDerivative<T>(), b);
} }
DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b); DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b);
template<class T> template <class T>
void BaseMatrixT<T>::expDerivative(BaseMatrixT& b) { void BaseMatrixT<T>::expDerivative(BaseMatrixT& b) {
applyBinary(binary::ExpDerivative<T>(), b); applyBinary(binary::ExpDerivative<T>(), b);
} }
DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f); DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f);
template<class T> template <class T>
void BaseMatrixT<T>::sign2(BaseMatrixT& b) { void BaseMatrixT<T>::sign2(BaseMatrixT& b) {
applyBinary(binary::Sign<T>(), b); applyBinary(binary::Sign<T>(), b);
} }
DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b)); DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b));
template<> template <>
void BaseMatrixT<real>::exp2(BaseMatrixT& b) { void BaseMatrixT<real>::exp2(BaseMatrixT& b) {
applyBinary(binary::Exp<real>(), b); applyBinary(binary::Exp<real>(), b);
} }
DEFINE_MATRIX_BINARY_OP(Log, a = log(b)); DEFINE_MATRIX_BINARY_OP(Log, a = log(b));
template<> template <>
void BaseMatrixT<real>::log2(BaseMatrixT& b) { void BaseMatrixT<real>::log2(BaseMatrixT& b) {
if (useGpu_) { if (useGpu_) {
applyBinary(binary::Log<real>(), b); applyBinary(binary::Log<real>(), b);
...@@ -757,13 +848,13 @@ void BaseMatrixT<real>::log2(BaseMatrixT& b) { ...@@ -757,13 +848,13 @@ void BaseMatrixT<real>::log2(BaseMatrixT& b) {
} }
DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b)); DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b));
template<> template <>
void BaseMatrixT<real>::sqrt2(BaseMatrixT& b) { void BaseMatrixT<real>::sqrt2(BaseMatrixT& b) {
applyBinary(binary::Sqrt<real>(), b); applyBinary(binary::Sqrt<real>(), b);
} }
DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b)); DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b));
template<> template <>
void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) { void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) {
if (useGpu_) { if (useGpu_) {
applyBinary(binary::InvSqrt<real>(), b); applyBinary(binary::InvSqrt<real>(), b);
...@@ -775,37 +866,37 @@ void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) { ...@@ -775,37 +866,37 @@ void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) {
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p)); DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p));
template<class T> template <class T>
void BaseMatrixT<T>::isEqualTo(BaseMatrixT& b, T value) { void BaseMatrixT<T>::isEqualTo(BaseMatrixT& b, T value) {
applyBinary(binary::IsEqual<T>(value), b); applyBinary(binary::IsEqual<T>(value), b);
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p); DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p);
template<class T> template <class T>
void BaseMatrixT<T>::addScalar(BaseMatrixT& b, T p) { void BaseMatrixT<T>::addScalar(BaseMatrixT& b, T p) {
applyBinary(binary::AddScalar<T>(p), b); applyBinary(binary::AddScalar<T>(p), b);
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p); DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p);
template<class T> template <class T>
void BaseMatrixT<T>::subScalar(BaseMatrixT& b, T p) { void BaseMatrixT<T>::subScalar(BaseMatrixT& b, T p) {
applyBinary(binary::SubScalar<T>(p), b); applyBinary(binary::SubScalar<T>(p), b);
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p); DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p);
template<class T> template <class T>
void BaseMatrixT<T>::mulScalar(BaseMatrixT& b, T p) { void BaseMatrixT<T>::mulScalar(BaseMatrixT& b, T p) {
applyBinary(binary::MulScalar<T>(p), b); applyBinary(binary::MulScalar<T>(p), b);
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p); DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p);
template<class T> template <class T>
void BaseMatrixT<T>::divScalar(BaseMatrixT& b, T p) { void BaseMatrixT<T>::divScalar(BaseMatrixT& b, T p) {
applyBinary(binary::DivScalar<T>(p), b); applyBinary(binary::DivScalar<T>(p), b);
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b); DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b);
template<class T> template <class T>
void BaseMatrixT<T>::scalarDiv(BaseMatrixT& b, T p) { void BaseMatrixT<T>::scalarDiv(BaseMatrixT& b, T p) {
applyBinary(binary::ScalarDiv<T>(p), b); applyBinary(binary::ScalarDiv<T>(p), b);
} }
...@@ -817,20 +908,20 @@ void BaseMatrixT<T>::scalarDiv(BaseMatrixT& b, T p) { ...@@ -817,20 +908,20 @@ void BaseMatrixT<T>::scalarDiv(BaseMatrixT& b, T p) {
DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy, DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy,
a = -c * log(b) - (1 - c) * log(1 - b)); a = -c * log(b) - (1 - c) * log(1 - b));
template<> template <>
void BaseMatrixT<real>::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<real>::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::SoftCrossEntropy<real>(), b, c); applyTernary(ternary::SoftCrossEntropy<real>(), b, c);
} }
DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b))); DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b)));
template<class T> template <class T>
void BaseMatrixT<T>::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::SoftCrossEntropyBp<T>(), b, c); applyTernary(ternary::SoftCrossEntropyBp<T>(), b, c);
} }
DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy, DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy,
a = c > 0.5 ? -log(b) : -log(1.0 - b)); a = c > 0.5 ? -log(b) : -log(1.0 - b));
template<> template <>
void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b, void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b,
BaseMatrixT& c) { BaseMatrixT& c) {
if (useGpu_) { if (useGpu_) {
...@@ -858,70 +949,73 @@ void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b, ...@@ -858,70 +949,73 @@ void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b,
DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp, DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp,
a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)); a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b));
template<class T> template <class T>
void BaseMatrixT<T>::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::BinaryCrossEntropyBp<T>(), b, c); applyTernary(ternary::BinaryCrossEntropyBp<T>(), b, c);
} }
DEFINE_MATRIX_TERNARY_OP(Add, a = b + c); DEFINE_MATRIX_TERNARY_OP(Add, a = b + c);
template<class T> template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::add(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::Add<T>(), b, c); applyTernary(ternary::Add<T>(), b, c);
} }
DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c); DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c);
template<class T> template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
applyTernary(ternary::Add1<T>(p1, p2), b, c); applyTernary(ternary::Add1<T>(p1, p2), b, c);
} }
DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c); DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c);
template<class T> template <class T>
void BaseMatrixT<T>::sub(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::sub(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::Sub<T>(), b, c); applyTernary(ternary::Sub<T>(), b, c);
} }
DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c); DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c);
template<class T> template <class T>
void BaseMatrixT<T>::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { void BaseMatrixT<T>::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
applyTernary(ternary::Sub1<T>(p1, p2), b, c); applyTernary(ternary::Sub1<T>(p1, p2), b, c);
} }
DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c); DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c);
template<class T> template <class T>
void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::Add2<T>(), b, c); applyTernary(ternary::Add2<T>(), b, c);
} }
DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3,
THREE_PARAMETER,
a = p1 * a + p2 * b + p3 * c); a = p1 * a + p2 * b + p3 * c);
template<class T> template <class T>
void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
applyTernary(ternary::Add3<T>(p1, p2, p3), b, c); applyTernary(ternary::Add3<T>(p1, p2, p3), b, c);
} }
DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate,
THREE_PARAMETER,
c = p2 * c - p1 * (b + p3 * a); c = p2 * c - p1 * (b + p3 * a);
a = a + c); a = a + c);
template<class T> template <class T>
void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b, // grad void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b, // grad
BaseMatrixT& c, // mom BaseMatrixT& c, // mom
T p1, // learningRate, T p1, // learningRate,
T p2, // momentum, T p2, // momentum,
T p3) { // decayRate T p3) { // decayRate
applyTernary(ternary::SgdUpdate<T>(p1, p2, p3), b, c); applyTernary(ternary::SgdUpdate<T>(p1, p2, p3), b, c);
} }
DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate,
THREE_PARAMETER,
c = p2 * c - p1 * d * (b + p3 * a); c = p2 * c - p1 * d * (b + p3 * a);
a += c); a += c);
template<class T> template <class T>
void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b, // grad, void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b, // grad,
BaseMatrixT& c, // mom, BaseMatrixT& c, // mom,
BaseMatrixT& d, // lr, BaseMatrixT& d, // lr,
T p1, // learningRate, T p1, // learningRate,
T p2, // momentum, T p2, // momentum,
T p3) { // decayRate T p3) { // decayRate
applyQuaternary(quaternary::SgdUpdate<T>(p1, p2, p3), b, c, d); applyQuaternary(quaternary::SgdUpdate<T>(p1, p2, p3), b, c, d);
} }
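The SgdUpdate ops above encode momentum SGD with weight decay: with weight w (= a), gradient g (= b), momentum buffer v (= c), learning rate p1, momentum p2 and decay p3 (optionally scaled element-wise by a learning-rate matrix d in the quaternary form), the update is v <- p2*v - p1*(g + p3*w), then w <- w + v. A tiny scalar sketch of one step:

// One momentum-SGD step with weight decay, matching ternary::SgdUpdate above:
//   v = momentum * v - lr * (grad + decay * w);  w += v;
void sgdStep(float& w, float& v, float grad, float lr, float momentum, float decay) {
  v = momentum * v - lr * (grad + decay * w);
  w += v;
}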
...@@ -929,19 +1023,22 @@ DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b; ...@@ -929,19 +1023,22 @@ DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b;
a = (a > lambda) a = (a > lambda)
? (a - lambda) ? (a - lambda)
: (a < -lambda) ? (a + lambda) : 0); : (a < -lambda) ? (a + lambda) : 0);
template<class T> template <class T>
void BaseMatrixT<T>::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) { void BaseMatrixT<T>::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) {
applyBinary(binary::ApplyL1<T>(learningRate * decayRate), lr); applyBinary(binary::ApplyL1<T>(learningRate * decayRate), lr);
} }
template<> template <>
void BaseMatrixT<real>::applyL1(BaseMatrixT& lr, void BaseMatrixT<real>::applyL1(BaseMatrixT& lr,
real learningRate, real learningRate,
real decayRate) { real decayRate) {
if (useGpu_) { if (useGpu_) {
applyBinary(binary::ApplyL1<real>(learningRate * decayRate), lr); applyBinary(binary::ApplyL1<real>(learningRate * decayRate), lr);
} else { } else {
simd::decayL1(this->data_, this->data_, lr.data_, learningRate * decayRate, simd::decayL1(this->data_,
this->data_,
lr.data_,
learningRate * decayRate,
height_ * width_); height_ * width_);
} }
} }
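The ApplyL1 op and simd::decayL1 fallback above implement L1 weight decay as soft-thresholding: each weight is shrunk toward zero by lambda = learningRate * decayRate, and anything inside [-lambda, lambda] is clamped to zero. A scalar sketch:

#include <cmath>

// L1 decay (soft-thresholding), matching binary::ApplyL1 / unary::ApplyL1 above.
float decayL1(float w, float lambda) {
  if (w > lambda) return w - lambda;
  if (w < -lambda) return w + lambda;
  return 0.0f;
}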
...@@ -950,24 +1047,25 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p; ...@@ -950,24 +1047,25 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p;
a = (a > lambda) a = (a > lambda)
? (a - lambda) ? (a - lambda)
: (a < -lambda) ? (a + lambda) : 0); : (a < -lambda) ? (a + lambda) : 0);
template<class T> template <class T>
void BaseMatrixT<T>::applyL1(T learningRate, T decayRate) { void BaseMatrixT<T>::applyL1(T learningRate, T decayRate) {
applyUnary(unary::ApplyL1<T>(learningRate * decayRate)); applyUnary(unary::ApplyL1<T>(learningRate * decayRate));
} }
template<> template <>
void BaseMatrixT<real>::applyL1(real learningRate, real decayRate) { void BaseMatrixT<real>::applyL1(real learningRate, real decayRate) {
if (useGpu_) { if (useGpu_) {
applyUnary(unary::ApplyL1<real>(learningRate * decayRate)); applyUnary(unary::ApplyL1<real>(learningRate * decayRate));
} else { } else {
simd::decayL1(this->data_, this->data_, learningRate * decayRate, simd::decayL1(
height_ * width_); this->data_, this->data_, learningRate * decayRate, height_ * width_);
} }
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, ONE_PARAMETER, DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2,
ONE_PARAMETER,
a *= (1.0f / (1.0f + p * b))); a *= (1.0f / (1.0f + p * b)));
template<class T> template <class T>
void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) {
if (useGpu_) { if (useGpu_) {
applyBinary(binary::ApplyL2<T>(learningRate * decayRate), lr); applyBinary(binary::ApplyL2<T>(learningRate * decayRate), lr);
...@@ -980,32 +1078,33 @@ void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { ...@@ -980,32 +1078,33 @@ void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) {
} }
} }
template<class T> template <class T>
void BaseMatrixT<T>::applyL2(T learningRate, T decayRate) { void BaseMatrixT<T>::applyL2(T learningRate, T decayRate) {
BaseMatrixT<T>::mulScalar(1.0f / (1.0f + learningRate * decayRate)); BaseMatrixT<T>::mulScalar(1.0f / (1.0f + learningRate * decayRate));
} }
DEFINE_MATRIX_BINARY_OP(DotMul, a *= b); DEFINE_MATRIX_BINARY_OP(DotMul, a *= b);
template<class T> template <class T>
void BaseMatrixT<T>::dotMul(BaseMatrixT& b) { void BaseMatrixT<T>::dotMul(BaseMatrixT& b) {
applyBinary(binary::DotMul<T>(), b); applyBinary(binary::DotMul<T>(), b);
} }
DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c); DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c);
template<class T> template <class T>
void BaseMatrixT<T>::dotMul(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::dotMul(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::DotMul<T>(), b, c); applyTernary(ternary::DotMul<T>(), b, c);
} }
DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c); DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c);
template<class T> template <class T>
void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::DotDiv<T>(), b, c); applyTernary(ternary::DotDiv<T>(), b, c);
} }
DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, TWO_PARAMETER, DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P,
TWO_PARAMETER,
a = (b + p1) / (c + p2)); a = (b + p1) / (c + p2));
template<class T> template <class T>
void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
applyTernary(ternary::DotDiv2P<T>(p1, p2), b, c); applyTernary(ternary::DotDiv2P<T>(p1, p2), b, c);
} }
...@@ -1015,7 +1114,7 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c; ...@@ -1015,7 +1114,7 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c;
? THRESHOLD ? THRESHOLD
: ((a < -THRESHOLD) ? (-THRESHOLD) : a); : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
a = log(1 + exp(a)) - a * d); a = log(1 + exp(a)) - a * d);
template<> template <>
void BaseMatrixT<real>::rankLoss(BaseMatrixT& b, void BaseMatrixT<real>::rankLoss(BaseMatrixT& b,
BaseMatrixT& c, BaseMatrixT& c,
BaseMatrixT& d) { BaseMatrixT& d) {
...@@ -1026,8 +1125,9 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c; ...@@ -1026,8 +1125,9 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c;
a = (a > THRESHOLD) a = (a > THRESHOLD)
? THRESHOLD ? THRESHOLD
: ((a < -THRESHOLD) ? (-THRESHOLD) : a); : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
a = exp(a); a = (a / (1 + a) - d)); a = exp(a);
a = (a / (1 + a) - d));
template<> template <>
void BaseMatrixT<real>::rankLossBp(BaseMatrixT& b, void BaseMatrixT<real>::rankLossBp(BaseMatrixT& b,
BaseMatrixT& c, BaseMatrixT& c,
BaseMatrixT& d) { BaseMatrixT& d) {
...@@ -1040,7 +1140,7 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0; ...@@ -1040,7 +1140,7 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0;
? -THRESHOLD ? -THRESHOLD
: b; : b;
a = log(1 + exp(x)) - c * x); a = log(1 + exp(x)) - c * x);
template<> template <>
void BaseMatrixT<real>::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<real>::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::LogisticRegressionLoss<real>(), b, c); applyTernary(ternary::LogisticRegressionLoss<real>(), b, c);
} }
...@@ -1050,22 +1150,23 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0; ...@@ -1050,22 +1150,23 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0;
T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
? -THRESHOLD ? -THRESHOLD
: b; : b;
x = exp(x); a = x / (1 + x) - c); x = exp(x);
a = x / (1 + x) - c);
template<> template <>
void BaseMatrixT<real>::logisticRegressionLossBp(BaseMatrixT& b, void BaseMatrixT<real>::logisticRegressionLossBp(BaseMatrixT& b,
BaseMatrixT& c) { BaseMatrixT& c) {
applyTernary(ternary::LogisticRegressionLossBp<real>(), b, c); applyTernary(ternary::LogisticRegressionLossBp<real>(), b, c);
} }
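LogisticRegressionLoss and its *Bp counterpart clip the input to ±40 before the log-sigmoid loss so exp() cannot overflow, and the backward op reduces to sigmoid(x) - label; RankLoss above applies the same loss to the clipped score difference b - c. A scalar sketch of the logistic pair (illustrative helpers; "logit" and "label" are assumed meanings of b and c):

  #include <algorithm>
  #include <cmath>

  inline float logisticLossSketch(float logit, float label) {
    const float kThreshold = 40.0f;
    float x = std::min(kThreshold, std::max(-kThreshold, logit));
    return std::log(1.0f + std::exp(x)) - label * x;
  }

  inline float logisticLossBpSketch(float logit, float label) {
    const float kThreshold = 40.0f;
    float x = std::min(kThreshold, std::max(-kThreshold, logit));
    float e = std::exp(x);
    return e / (1.0f + e) - label;  // sigmoid(x) - label
  }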
DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f); DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f);
template<class T> template <class T>
void BaseMatrixT<T>::biggerThan(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::biggerThan(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::BiggerThan<T>(), b, c); applyTernary(ternary::BiggerThan<T>(), b, c);
} }
DEFINE_MATRIX_QUATERNARY_OP( DEFINE_MATRIX_QUATERNARY_OP(
BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f); BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
template<class T> template <class T>
void BaseMatrixT<T>::biggerThan(BaseMatrixT& b, void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
BaseMatrixT& c, BaseMatrixT& c,
BaseMatrixT& d) { BaseMatrixT& d) {
...@@ -1073,25 +1174,34 @@ void BaseMatrixT<T>::biggerThan(BaseMatrixT& b, ...@@ -1073,25 +1174,34 @@ void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
} }
DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c); DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c);
template<class T> template <class T>
void BaseMatrixT<T>::max2(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::max2(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::Max<T>(), b, c); applyTernary(ternary::Max<T>(), b, c);
} }
DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, ONE_PARAMETER, DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError,
ONE_PARAMETER,
c += ((a > p) == (b > p)) ? 0.0f : 1.0f); c += ((a > p) == (b > p)) ? 0.0f : 1.0f);
template<class T> template <class T>
void BaseMatrixT<T>::binaryClassificationError2(size_t destCol, BaseMatrixT& b, void BaseMatrixT<T>::binaryClassificationError2(size_t destCol,
BaseMatrixT& c, T p) { BaseMatrixT& b,
BaseMatrixT& c,
T p) {
CHECK(!useGpu_) << "do not support gpu"; CHECK(!useGpu_) << "do not support gpu";
MatrixOffset offset(0, 0, 0, 0, destCol, 0); MatrixOffset offset(0, 0, 0, 0, destCol, 0);
int numRows = b.height_; int numRows = b.height_;
int numCols = b.width_; int numCols = b.width_;
b.applyTernary(ternary::BinaryClassificationError<T>(p), c, *this, numRows, b.applyTernary(ternary::BinaryClassificationError<T>(p),
numCols, offset, false_type(), true_type() /*cAsColVector*/); c,
*this,
numRows,
numCols,
offset,
false_type(),
true_type() /*cAsColVector*/);
} }
template<> template <>
void BaseMatrixT<real>::binaryClassificationError(size_t destCol, void BaseMatrixT<real>::binaryClassificationError(size_t destCol,
BaseMatrixT& b, BaseMatrixT& b,
BaseMatrixT& c, BaseMatrixT& c,
...@@ -1099,127 +1209,148 @@ void BaseMatrixT<real>::binaryClassificationError(size_t destCol, ...@@ -1099,127 +1209,148 @@ void BaseMatrixT<real>::binaryClassificationError(size_t destCol,
MatrixOffset offset(destCol, 0, 0, 0, 0, 0); MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
int numRows = b.height_; int numRows = b.height_;
int numCols = b.width_; int numCols = b.width_;
aggregate(aggregate::sum(), base::binary::classificationError(p), aggregate(aggregate::sum(),
base::binary::add(), b, c, numRows, numCols, offset, false_type(), base::binary::classificationError(p),
base::binary::add(),
b,
c,
numRows,
numCols,
offset,
false_type(),
true_type() /*aAsColVector*/); true_type() /*aAsColVector*/);
} }
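binaryClassificationError reduces each row to the number of columns where b and c fall on different sides of the threshold p, accumulated into the destination column (the ternary op adds 1 whenever (a > p) != (b > p)). A rough per-row sketch (names are made up for illustration):

  #include <cstddef>
  #include <vector>

  void binaryClassErrorSketch(const std::vector<std::vector<float>>& b,
                              const std::vector<std::vector<float>>& c,
                              float p,
                              std::vector<float>& err) {
    for (std::size_t i = 0; i < b.size(); ++i) {
      float count = 0.0f;
      for (std::size_t j = 0; j < b[i].size(); ++j) {
        count += ((b[i][j] > p) == (c[i][j] > p)) ? 0.0f : 1.0f;
      }
      err[i] += count;  // accumulates, matching the += in the ternary op
    }
  }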
DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3,
THREE_PARAMETER,
a = p1 * b + p2 * c + p3 * d); a = p1 * b + p2 * c + p3 * d);
template<class T> template <class T>
void BaseMatrixT<T>::add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, void BaseMatrixT<T>::add3(
T p2, T p3) { BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) {
applyQuaternary(quaternary::Add3<T>(p1, p2, p3), b, c, d); applyQuaternary(quaternary::Add3<T>(p1, p2, p3), b, c, d);
} }
DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c); DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c);
template<class T> template <class T>
void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::DotMulSquare<T>(), b, c); applyTernary(ternary::DotMulSquare<T>(), b, c);
} }
DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c); DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c);
template<class T> template <class T>
void BaseMatrixT<T>::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::DotSquareSquare<T>(), b, c); applyTernary(ternary::DotSquareSquare<T>(), b, c);
} }
DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b); DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b);
template<class T> template <class T>
void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b) { void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b) {
applyBinary(binary::DotMulSquare<T>(), b); applyBinary(binary::DotMulSquare<T>(), b);
} }
DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b); DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b);
template<class T> template <class T>
void BaseMatrixT<T>::dotSquareMul(BaseMatrixT& b) { void BaseMatrixT<T>::dotSquareMul(BaseMatrixT& b) {
applyBinary(binary::DotSquareMul<T>(), b); applyBinary(binary::DotSquareMul<T>(), b);
} }
DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, THREE_PARAMETER, DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum,
THREE_PARAMETER,
T tmp = p1 * b + p2 * c + p3 * d; T tmp = p1 * b + p2 * c + p3 * d;
a += tmp * tmp); a += tmp * tmp);
template<class T> template <class T>
void BaseMatrixT<T>::addSquareSum(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, void BaseMatrixT<T>::addSquareSum(
T p1, T p2, T p3) { BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) {
applyQuaternary(quaternary::AddSquareSum<T>(p1, p2, p3), b, c, d); applyQuaternary(quaternary::AddSquareSum<T>(p1, p2, p3), b, c, d);
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b); DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b);
template<class T> template <class T>
void BaseMatrixT<T>::addSquare(BaseMatrixT& b, T p) { void BaseMatrixT<T>::addSquare(BaseMatrixT& b, T p) {
applyBinary(binary::AddSquare<T>(p), b); applyBinary(binary::AddSquare<T>(p), b);
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, TWO_PARAMETER, DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare,
TWO_PARAMETER,
a = p1 * a + p2 * b * b); a = p1 * a + p2 * b * b);
template<class T> template <class T>
void BaseMatrixT<T>::decayAddSquare(BaseMatrixT& b, T p1, T p2) { void BaseMatrixT<T>::decayAddSquare(BaseMatrixT& b, T p1, T p2) {
applyBinary(binary::DecayAddSquare<T>(p1, p2), b); applyBinary(binary::DecayAddSquare<T>(p1, p2), b);
} }
DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, TWO_PARAMETER, DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul,
TWO_PARAMETER,
a = p1 * a + p2 * b * b * c * c); a = p1 * a + p2 * b * b * c * c);
template<class T> template <class T>
void BaseMatrixT<T>::decayAddSquareMul(BaseMatrixT& b, BaseMatrixT& c, T p1, void BaseMatrixT<T>::decayAddSquareMul(BaseMatrixT& b,
BaseMatrixT& c,
T p1,
T p2) { T p2) {
applyTernary(ternary::DecayAddSquareMul<T>(p1, p2), b, c); applyTernary(ternary::DecayAddSquareMul<T>(p1, p2), b, c);
} }
DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, THREE_PARAMETER, DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum,
THREE_PARAMETER,
a = 1 / (p1 * b + p2 * c + p3)); a = 1 / (p1 * b + p2 * c + p3));
template<class T> template <class T>
void BaseMatrixT<T>::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, void BaseMatrixT<T>::reciprocalSum(
T p3) { BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
applyTernary(ternary::ReciprocalSum<T>(p1, p2, p3), b, c); applyTernary(ternary::ReciprocalSum<T>(p1, p2, p3), b, c);
} }
DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER, DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2,
TWO_PARAMETER,
a = 1 / (p1 * b + p2)); a = 1 / (p1 * b + p2));
template<class T> template <class T>
void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b, T p1, T p2) { void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b, T p1, T p2) {
applyBinary(binary::Reciprocal2<T>(p1, p2), b); applyBinary(binary::Reciprocal2<T>(p1, p2), b);
} }
DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, TWO_PARAMETER, DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum,
TWO_PARAMETER,
T tmp = p1 * b + p2 * c; T tmp = p1 * b + p2 * c;
a *= tmp * tmp); a *= tmp * tmp);
template<class T> template <class T>
void BaseMatrixT<T>::dotMulSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, void BaseMatrixT<T>::dotMulSquareSum(BaseMatrixT& b,
BaseMatrixT& c,
T p1,
T p2) { T p2) {
applyTernary(ternary::DotMulSquareSum<T>(p1, p2), b, c); applyTernary(ternary::DotMulSquareSum<T>(p1, p2), b, c);
} }
DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, TWO_PARAMETER, DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum,
TWO_PARAMETER,
T tmp = p1 * b + p2 * c; T tmp = p1 * b + p2 * c;
a = tmp * tmp); a = tmp * tmp);
template<class T> template <class T>
void BaseMatrixT<T>::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { void BaseMatrixT<T>::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
applyTernary(ternary::DotSquareSum<T>(p1, p2), b, c); applyTernary(ternary::DotSquareSum<T>(p1, p2), b, c);
} }
DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, TWO_PARAMETER, DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum,
TWO_PARAMETER,
a *= p1 * b + p2 * c); a *= p1 * b + p2 * c);
template<class T> template <class T>
void BaseMatrixT<T>::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { void BaseMatrixT<T>::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
applyTernary(ternary::DotMulSum<T>(p1, p2), b, c); applyTernary(ternary::DotMulSum<T>(p1, p2), b, c);
} }
DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0); DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0);
template<class T> template <class T>
void BaseMatrixT<T>::copyAndClear(BaseMatrixT& b) { void BaseMatrixT<T>::copyAndClear(BaseMatrixT& b) {
applyBinary(binary::CopyAndClear<T>(), b); applyBinary(binary::CopyAndClear<T>(), b);
} }
DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, TWO_PARAMETER, DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul,
TWO_PARAMETER,
a = p1 * a + p2 * b * c); a = p1 * a + p2 * b * c);
template<class T> template <class T>
void BaseMatrixT<T>::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { void BaseMatrixT<T>::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
applyTernary(ternary::AddDotMul<T>(p1, p2), b, c); applyTernary(ternary::AddDotMul<T>(p1, p2), b, c);
} }
DEFINE_MATRIX_BINARY_OP(Assign, a = b;); DEFINE_MATRIX_BINARY_OP(Assign, a = b;);
template<class T> template <class T>
void BaseMatrixT<T>::assign(BaseMatrixT& b) { void BaseMatrixT<T>::assign(BaseMatrixT& b) {
if (useGpu_) { if (useGpu_) {
applyBinary(binary::Assign<T>(), b); applyBinary(binary::Assign<T>(), b);
...@@ -1230,7 +1361,7 @@ void BaseMatrixT<T>::assign(BaseMatrixT& b) { ...@@ -1230,7 +1361,7 @@ void BaseMatrixT<T>::assign(BaseMatrixT& b) {
} }
} }
template<class T> template <class T>
void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
if (columnOffset + b.width_ <= width_) { if (columnOffset + b.width_ <= width_) {
int numRows = height_; int numRows = height_;
...@@ -1250,24 +1381,31 @@ void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { ...@@ -1250,24 +1381,31 @@ void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
} }
DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp); DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp);
template<class T> template <class T>
void BaseMatrixT<T>::deepSwap(BaseMatrixT& b) { void BaseMatrixT<T>::deepSwap(BaseMatrixT& b) {
applyBinary(binary::DeepSwap<T>(), b); applyBinary(binary::DeepSwap<T>(), b);
} }
template<> template <>
void BaseMatrixT<real>::rowDotMul(size_t destCol, void BaseMatrixT<real>::rowDotMul(size_t destCol,
BaseMatrixT& b, BaseMatrixT& b,
BaseMatrixT& c) { BaseMatrixT& c) {
int numRows = b.height_; int numRows = b.height_;
int numCols = b.width_; int numCols = b.width_;
MatrixOffset offset(destCol, 0, 0, 0, 0, 0); MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, aggregate(aggregate::sum(),
numRows, numCols, offset, false_type(), base::binary::mul(),
base::binary::add(),
b,
c,
numRows,
numCols,
offset,
false_type(),
true_type() /*aAsColVector*/); true_type() /*aAsColVector*/);
} }
template<class T> template <class T>
void BaseMatrixT<T>::rowDotMul2(size_t destCol, void BaseMatrixT<T>::rowDotMul2(size_t destCol,
BaseMatrixT& b, BaseMatrixT& b,
BaseMatrixT& c) { BaseMatrixT& c) {
...@@ -1290,17 +1428,24 @@ void BaseMatrixT<T>::rowDotMul2(size_t destCol, ...@@ -1290,17 +1428,24 @@ void BaseMatrixT<T>::rowDotMul2(size_t destCol,
} }
} }
template<> template <>
void BaseMatrixT<real>::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<real>::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) {
MatrixOffset offset(0, 0, 0, 0, 0, 0); MatrixOffset offset(0, 0, 0, 0, 0, 0);
int numRows = b.height_; int numRows = b.height_;
int numCols = b.width_; int numCols = b.width_;
aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, aggregate(aggregate::sum(),
numRows, numCols, offset, true_type() /*aAsRowVector*/, base::binary::mul(),
base::binary::add(),
b,
c,
numRows,
numCols,
offset,
true_type() /*aAsRowVector*/,
false_type()); false_type());
} }
template<class T> template <class T>
void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) {
CHECK(!useGpu_) << "do not support gpu"; CHECK(!useGpu_) << "do not support gpu";
...@@ -1321,16 +1466,22 @@ void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { ...@@ -1321,16 +1466,22 @@ void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) {
} }
DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c); DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c);
template<class T> template <class T>
void BaseMatrixT<T>::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) {
MatrixOffset offset(0, 0, 0, 0, 0, 0); MatrixOffset offset(0, 0, 0, 0, 0, 0);
int numRows = height_; int numRows = height_;
int numCols = width_; int numCols = width_;
applyTernary(ternary::addDotMulMMV<T>(), b, c, numRows, numCols, offset, applyTernary(ternary::addDotMulMMV<T>(),
true_type() /*cAsRowVector*/, false_type()); b,
c,
numRows,
numCols,
offset,
true_type() /*cAsRowVector*/,
false_type());
} }
template<class T> template <class T>
void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) {
CHECK(!useGpu_) << "do not support gpu"; CHECK(!useGpu_) << "do not support gpu";
...@@ -1350,16 +1501,22 @@ void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { ...@@ -1350,16 +1501,22 @@ void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) {
} }
} }
template<class T> template <class T>
void BaseMatrixT<T>::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
MatrixOffset offset(0, 0, 0, 0, cCol, 0); MatrixOffset offset(0, 0, 0, 0, cCol, 0);
int numRows = height_; int numRows = height_;
int numCols = width_; int numCols = width_;
applyTernary(ternary::DotMul<T>(), b, c, numRows, numCols, offset, applyTernary(ternary::DotMul<T>(),
false_type(), true_type() /*cAsColVector*/); b,
c,
numRows,
numCols,
offset,
false_type(),
true_type() /*cAsColVector*/);
} }
template<class T> template <class T>
void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
CHECK(!useGpu_) << "do not support gpu"; CHECK(!useGpu_) << "do not support gpu";
...@@ -1379,52 +1536,82 @@ void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { ...@@ -1379,52 +1536,82 @@ void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
} }
} }
template<class T> template <class T>
void BaseMatrixT<T>::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
MatrixOffset offset(0, 0, 0, 0, 0, cRow); MatrixOffset offset(0, 0, 0, 0, 0, cRow);
int numRows = height_; int numRows = height_;
int numCols = width_; int numCols = width_;
applyTernary(ternary::DotMul<T>(), b, c, numRows, numCols, offset, applyTernary(ternary::DotMul<T>(),
true_type() /* cAsRowVector */, false_type() /* cAsColVector */); b,
c,
numRows,
numCols,
offset,
true_type() /* cAsRowVector */,
false_type() /* cAsColVector */);
} }
template<class T> template <class T>
void BaseMatrixT<T>::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
MatrixOffset offset(0, 0, 0, 0, 0, cRow); MatrixOffset offset(0, 0, 0, 0, 0, cRow);
int numRows = height_; int numRows = height_;
int numCols = width_; int numCols = width_;
applyTernary(ternary::addDotMulMMV<T>(), b, c, numRows, numCols, offset, applyTernary(ternary::addDotMulMMV<T>(),
true_type() /* cAsRowVector */, false_type() /* cAsColVector */); b,
c,
numRows,
numCols,
offset,
true_type() /* cAsRowVector */,
false_type() /* cAsColVector */);
} }
template<class T> template <class T>
void BaseMatrixT<T>::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<T>::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
MatrixOffset offset(0, 0, 0, 0, cCol, 0); MatrixOffset offset(0, 0, 0, 0, cCol, 0);
int numRows = height_; int numRows = height_;
int numCols = width_; int numCols = width_;
applyTernary(ternary::addDotMulMMV<T>(), b, c, numRows, numCols, offset, applyTernary(ternary::addDotMulMMV<T>(),
false_type(), true_type() /*cAsColVector*/); b,
c,
numRows,
numCols,
offset,
false_type(),
true_type() /*cAsColVector*/);
} }
DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c); DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c);
template<class T> template <class T>
void BaseMatrixT<T>::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) { void BaseMatrixT<T>::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) {
MatrixOffset offset(0, 0, 0, 0, cCol, 0); MatrixOffset offset(0, 0, 0, 0, cCol, 0);
int numRows = height_; int numRows = height_;
int numCols = width_; int numCols = width_;
applyTernary(ternary::RowAdd<T>(p), b, c, numRows, numCols, offset, applyTernary(ternary::RowAdd<T>(p),
false_type(), true_type() /*cAsColVector*/); b,
c,
numRows,
numCols,
offset,
false_type(),
true_type() /*cAsColVector*/);
} }
DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c)); DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c));
template<> template <>
void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
if (useGpu_) { if (useGpu_) {
MatrixOffset offset(0, 0, 0, 0, cCol, 0); MatrixOffset offset(0, 0, 0, 0, cCol, 0);
int numRows = height_; int numRows = height_;
int numCols = width_; int numCols = width_;
applyTernary(ternary::RowPow<real>(), b, c, numRows, numCols, offset, applyTernary(ternary::RowPow<real>(),
false_type(), true_type() /*cAsColVector*/); b,
c,
numRows,
numCols,
offset,
false_type(),
true_type() /*cAsColVector*/);
} else { } else {
size_t height = this->height_; size_t height = this->height_;
size_t width = this->width_; size_t width = this->width_;
...@@ -1441,44 +1628,64 @@ void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { ...@@ -1441,44 +1628,64 @@ void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
} }
} }
template<class T> template <class T>
void BaseMatrixT<T>::mulRowVector(BaseMatrixT& b) { void BaseMatrixT<T>::mulRowVector(BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0); MatrixOffset offset(0, 0, 0, 0);
int numRows = height_; int numRows = height_;
int numCols = width_; int numCols = width_;
applyBinary(binary::DotMul<T>(), b, numRows, numCols, offset, applyBinary(binary::DotMul<T>(),
true_type() /* bAsRowVector */, false_type()); b,
numRows,
numCols,
offset,
true_type() /* bAsRowVector */,
false_type());
} }
DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b); DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b);
template<class T> template <class T>
void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) { void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0); MatrixOffset offset(0, 0, 0, 0);
int numRows = height_; int numRows = height_;
int numCols = width_; int numCols = width_;
applyBinary(binary::DotDiv<T>(), b, numRows, numCols, offset, applyBinary(binary::DotDiv<T>(),
true_type() /* bAsRowVector */, false_type()); b,
numRows,
numCols,
offset,
true_type() /* bAsRowVector */,
false_type());
} }
template<class T> template <class T>
void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) { void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0); MatrixOffset offset(0, 0, 0, 0);
int numRows = height_; int numRows = height_;
int numCols = width_; int numCols = width_;
applyBinary(binary::DotMul<T>(), b, numRows, numCols, offset, applyBinary(binary::DotMul<T>(),
false_type(), true_type() /* bAsColVector */); b,
numRows,
numCols,
offset,
false_type(),
true_type() /* bAsColVector */);
} }
template<class T> template <class T>
void BaseMatrixT<T>::divColVector(BaseMatrixT& b) { void BaseMatrixT<T>::divColVector(BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0); MatrixOffset offset(0, 0, 0, 0);
int numRows = height_; int numRows = height_;
int numCols = width_; int numCols = width_;
applyBinary(binary::DotDiv<T>(), b, numRows, numCols, offset, applyBinary(binary::DotDiv<T>(),
false_type(), true_type() /* bAsColVector */); b,
numRows,
numCols,
offset,
false_type(),
true_type() /* bAsColVector */);
} }
template<> template <>
template <class Agg> template <class Agg>
int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) { int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0, 0, 0); MatrixOffset offset(0, 0, 0, 0, 0, 0);
...@@ -1486,13 +1693,20 @@ int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) { ...@@ -1486,13 +1693,20 @@ int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
size_t numCols = b.width_; size_t numCols = b.width_;
CHECK_EQ(height_, numRows); CHECK_EQ(height_, numRows);
CHECK_EQ(width_, 1UL); CHECK_EQ(width_, 1UL);
aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, aggregate(agg,
numCols, offset, false_type(), true_type() /*aAsColVector*/); base::unary::identity(),
base::binary::second(),
b,
numRows,
numCols,
offset,
false_type(),
true_type() /*aAsColVector*/);
return 0; return 0;
} }
template<> template <>
template <class Agg, class Saver> template <class Agg, class Saver>
int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0, 0, 0); MatrixOffset offset(0, 0, 0, 0, 0, 0);
...@@ -1500,16 +1714,25 @@ int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { ...@@ -1500,16 +1714,25 @@ int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
size_t numCols = b.width_; size_t numCols = b.width_;
CHECK_EQ(height_, numRows); CHECK_EQ(height_, numRows);
CHECK_EQ(width_, 1UL); CHECK_EQ(width_, 1UL);
aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, aggregate(agg,
false_type(), true_type() /*aAsColVector*/); base::unary::identity(),
sv,
b,
numRows,
numCols,
offset,
false_type(),
true_type() /*aAsColVector*/);
return 0; return 0;
} }
template<> template <>
template <class Agg> template <class Agg>
int BaseMatrixT<real>::applyRow( int BaseMatrixT<real>::applyRow(Agg agg,
Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { real scaleDest,
real scaleAgg,
BaseMatrixT& b) {
if (scaleDest != 0) { if (scaleDest != 0) {
applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b); applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b);
} else { } else {
...@@ -1521,10 +1744,10 @@ int BaseMatrixT<real>::applyRow( ...@@ -1521,10 +1744,10 @@ int BaseMatrixT<real>::applyRow(
return 0; return 0;
} }
template<> template <>
template <class Agg, class Op, class Saver> template <class Agg, class Op, class Saver>
int BaseMatrixT<real>::applyRow(Agg agg, Op op, Saver sv, int BaseMatrixT<real>::applyRow(
BaseMatrixT& b, BaseMatrixT& c) { Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) {
MatrixOffset offset(0, 0, 0, 0, 0, 0); MatrixOffset offset(0, 0, 0, 0, 0, 0);
size_t numRows = b.height_; size_t numRows = b.height_;
size_t numCols = b.width_; size_t numCols = b.width_;
...@@ -1532,16 +1755,27 @@ int BaseMatrixT<real>::applyRow(Agg agg, Op op, Saver sv, ...@@ -1532,16 +1755,27 @@ int BaseMatrixT<real>::applyRow(Agg agg, Op op, Saver sv,
CHECK_EQ(width_, 1UL); CHECK_EQ(width_, 1UL);
CHECK_EQ(c.height_, numRows); CHECK_EQ(c.height_, numRows);
CHECK_EQ(c.width_, numCols); CHECK_EQ(c.width_, numCols);
aggregate(agg, op, sv, aggregate(agg,
b, c, numRows, numCols, offset, op,
false_type(), true_type() /*aAsColVector*/); sv,
b,
c,
numRows,
numCols,
offset,
false_type(),
true_type() /*aAsColVector*/);
return 0; return 0;
} }
template<> template <>
template <class Agg, class Op> template <class Agg, class Op>
int BaseMatrixT<real>::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, int BaseMatrixT<real>::applyRow(Agg agg,
BaseMatrixT& b, BaseMatrixT& c) { Op op,
real scaleDest,
real scaleAgg,
BaseMatrixT& b,
BaseMatrixT& c) {
if (scaleDest != 0) { if (scaleDest != 0) {
applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c); applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c);
} else { } else {
...@@ -1553,7 +1787,7 @@ int BaseMatrixT<real>::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, ...@@ -1553,7 +1787,7 @@ int BaseMatrixT<real>::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg,
return 0; return 0;
} }
template<> template <>
template <class Agg> template <class Agg>
int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) { int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0, 0, 0); MatrixOffset offset(0, 0, 0, 0, 0, 0);
...@@ -1561,13 +1795,20 @@ int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) { ...@@ -1561,13 +1795,20 @@ int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
size_t numCols = b.width_; size_t numCols = b.width_;
CHECK_EQ(width_, numCols); CHECK_EQ(width_, numCols);
CHECK_EQ(height_, 1UL); CHECK_EQ(height_, 1UL);
aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, aggregate(agg,
numCols, offset, true_type() /*aAsRowVector*/, false_type()); base::unary::identity(),
base::binary::second(),
b,
numRows,
numCols,
offset,
true_type() /*aAsRowVector*/,
false_type());
return 0; return 0;
} }
template<> template <>
template <class Agg, class Saver> template <class Agg, class Saver>
int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0, 0, 0); MatrixOffset offset(0, 0, 0, 0, 0, 0);
...@@ -1575,16 +1816,25 @@ int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { ...@@ -1575,16 +1816,25 @@ int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
size_t numCols = b.width_; size_t numCols = b.width_;
CHECK_EQ(width_, numCols); CHECK_EQ(width_, numCols);
CHECK_EQ(height_, 1UL); CHECK_EQ(height_, 1UL);
aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, aggregate(agg,
true_type() /*aAsRowVector*/, false_type()); base::unary::identity(),
sv,
b,
numRows,
numCols,
offset,
true_type() /*aAsRowVector*/,
false_type());
return 0; return 0;
} }
template<> template <>
template <class Agg> template <class Agg>
int BaseMatrixT<real>::applyCol( int BaseMatrixT<real>::applyCol(Agg agg,
Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { real scaleDest,
real scaleAgg,
BaseMatrixT& b) {
if (scaleDest != 0) { if (scaleDest != 0) {
applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b); applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b);
} else { } else {
...@@ -1596,48 +1846,51 @@ int BaseMatrixT<real>::applyCol( ...@@ -1596,48 +1846,51 @@ int BaseMatrixT<real>::applyCol(
return 0; return 0;
} }
template<> template <>
void BaseMatrixT<real>::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) { void BaseMatrixT<real>::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) {
applyRow(aggregate::sum(), scaleDest, scaleSum, b); applyRow(aggregate::sum(), scaleDest, scaleSum, b);
} }
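In the applyRow/applyCol overloads that take scaleDest and scaleAgg, the destination vector is blended as dest = scaleDest * dest + scaleAgg * agg(...), assuming base::binary::add2(p1, p2) computes a = p1 * a + p2 * b, which is what its use here suggests (the scaleDest == 0 branch simply overwrites). sumRows(b, scaleSum, scaleDest) then behaves roughly like this sketch (helper name made up; dest stands for the single-column *this):

  #include <cstddef>
  #include <numeric>
  #include <vector>

  void sumRowsSketch(const std::vector<std::vector<float>>& b,
                     std::vector<float>& dest,
                     float scaleSum, float scaleDest) {
    for (std::size_t i = 0; i < b.size(); ++i) {
      float rowSum = std::accumulate(b[i].begin(), b[i].end(), 0.0f);
      dest[i] = scaleDest * dest[i] + scaleSum * rowSum;
    }
  }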
template<> template <>
void BaseMatrixT<real>::maxRows(BaseMatrixT& b) { void BaseMatrixT<real>::maxRows(BaseMatrixT& b) {
applyRow(aggregate::max(), b); applyRow(aggregate::max(), b);
} }
template<> template <>
void BaseMatrixT<real>::minRows(BaseMatrixT& b) { void BaseMatrixT<real>::minRows(BaseMatrixT& b) {
applyRow(aggregate::min(), b); applyRow(aggregate::min(), b);
} }
template<> template <>
void BaseMatrixT<real>::maxCols(BaseMatrixT& b) { void BaseMatrixT<real>::maxCols(BaseMatrixT& b) {
applyCol(aggregate::max(), b); applyCol(aggregate::max(), b);
} }
template<> template <>
void BaseMatrixT<real>::minCols(BaseMatrixT& b) { void BaseMatrixT<real>::minCols(BaseMatrixT& b) {
applyCol(aggregate::min(), b); applyCol(aggregate::min(), b);
} }
template<> template <>
void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) { void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) {
applyCol(aggregate::sum(), scaleDest, scaleSum, b); applyCol(aggregate::sum(), scaleDest, scaleSum, b);
} }
template<> template <>
void BaseMatrixT<real>::sumOfSquaredDiffs( void BaseMatrixT<real>::sumOfSquaredDiffs(BaseMatrixT& b,
BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { BaseMatrixT& c,
applyRow(aggregate::sum(), base::binary::squaredDiff(), real scaleSum,
scaleDest, scaleSum, b, c); real scaleDest) {
applyRow(
aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c);
} }
template<> template <>
void BaseMatrixT<real>::sumOfProducts( void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { BaseMatrixT& c,
applyRow(aggregate::sum(), base::binary::mul(), real scaleSum,
scaleDest, scaleSum, b, c); real scaleDest) {
applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c);
} }
template class BaseMatrixT<real>; template class BaseMatrixT<real>;
......
...@@ -25,7 +25,7 @@ namespace paddle { ...@@ -25,7 +25,7 @@ namespace paddle {
*/ */
void sparseRand( void sparseRand(
int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu) { int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu) {
CHECK(size_t(nnz) > size_t(1)); CHECK(size_t(nnz) >= size_t(1));
int* cpuMajor; int* cpuMajor;
int* cpuMinor; int* cpuMinor;
CpuIVector cpuMinorVec(nnz); CpuIVector cpuMinorVec(nnz);
......
...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/utils/Logging.h"
#include "BaseMatrix.h" #include "BaseMatrix.h"
#include "TrainingAlgorithmOp.h" #include "TrainingAlgorithmOp.h"
#include "paddle/utils/Logging.h"
#if __cplusplus > 199711L #if __cplusplus > 199711L
...@@ -32,10 +32,10 @@ void sparseMomentumApply(BaseMatrix& value, ...@@ -32,10 +32,10 @@ void sparseMomentumApply(BaseMatrix& value,
real tau, real tau,
real learningRate) { real learningRate) {
auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad); auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad);
auto expr2 = momV.lazyAssign( auto expr2 =
momV + (tau * alpha * gamma * learningRate) * grad); momV.lazyAssign(momV + (tau * alpha * gamma * learningRate) * grad);
auto expr3 = value.lazyAssign( auto expr3 = value.lazyAssign((tau / beta + (real)1 / alpha) * momU +
(tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV); ((real)1 / beta) * momV);
AssignEvaluate(expr1, expr2, expr3); AssignEvaluate(expr1, expr2, expr3);
} }
...@@ -52,12 +52,12 @@ void adadeltaApply(BaseMatrix& value, ...@@ -52,12 +52,12 @@ void adadeltaApply(BaseMatrix& value,
real momentum, real momentum,
real decayRate) { real decayRate) {
auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square()); auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square());
auto expr2 = lr.lazyAssign( auto expr2 =
((accum_update + epsilon) / (accum + epsilon)).sqrt()); lr.lazyAssign(((accum_update + epsilon) / (accum + epsilon)).sqrt());
auto expr3 = accum_update.lazyAssign( auto expr3 = accum_update.lazyAssign(rou * accum_update +
rou * accum_update + ((real)1 - rou) * (grad * lr).square()); ((real)1 - rou) * (grad * lr).square());
auto expr4 = mom.lazyAssign( auto expr4 = mom.lazyAssign(mom * momentum -
mom * momentum - learningRate * lr * (grad + value * decayRate)); learningRate * lr * (grad + value * decayRate));
auto expr5 = value.lazyAssign(value + mom); auto expr5 = value.lazyAssign(value + mom);
AssignEvaluate(expr1, expr2, expr3, expr4, expr5); AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
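adadeltaApply chains the five Adadelta expressions so AssignEvaluate can fuse them; per element they amount to the following sketch (illustrative only, not the actual lazy-evaluation kernels):

  #include <cmath>

  void adadeltaSketch(float& value, float grad, float& mom, float& accum,
                      float& accum_update, float& lr,
                      float rou, float epsilon, float learningRate,
                      float momentum, float decayRate) {
    accum = rou * accum + (1.0f - rou) * grad * grad;
    lr = std::sqrt((accum_update + epsilon) / (accum + epsilon));
    accum_update = rou * accum_update + (1.0f - rou) * (grad * lr) * (grad * lr);
    mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
    value += mom;
  }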
...@@ -74,10 +74,10 @@ void adagradApply(BaseMatrix& value, ...@@ -74,10 +74,10 @@ void adagradApply(BaseMatrix& value,
real momentum, real momentum,
real decayRate) { real decayRate) {
auto expr1 = accum.lazyAssign(accum + grad.square()); auto expr1 = accum.lazyAssign(accum + grad.square());
auto expr2 = lr.lazyAssign( auto expr2 =
(accum_buffer + accum + epsilon).sqrt().reciprocal()); lr.lazyAssign((accum_buffer + accum + epsilon).sqrt().reciprocal());
auto expr3 = mom.lazyAssign( auto expr3 = mom.lazyAssign(mom * momentum -
mom * momentum - learningRate * lr * (grad + value * decayRate)); learningRate * lr * (grad + value * decayRate));
auto expr4 = value.lazyAssign(value + mom); auto expr4 = value.lazyAssign(value + mom);
AssignEvaluate(expr1, expr2, expr3, expr4); AssignEvaluate(expr1, expr2, expr3, expr4);
...@@ -98,8 +98,8 @@ void rmspropApply(BaseMatrix& value, ...@@ -98,8 +98,8 @@ void rmspropApply(BaseMatrix& value,
bool firstTime) { bool firstTime) {
auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad); auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad);
auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal()); auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal());
auto expr4 = mom.lazyAssign( auto expr4 = mom.lazyAssign(mom * momentum -
mom * momentum - learningRate * lr * (grad + value * decayRate)); learningRate * lr * (grad + value * decayRate));
auto expr5 = value.lazyAssign(value + mom); auto expr5 = value.lazyAssign(value + mom);
if (firstTime) { if (firstTime) {
...@@ -107,8 +107,8 @@ void rmspropApply(BaseMatrix& value, ...@@ -107,8 +107,8 @@ void rmspropApply(BaseMatrix& value,
AssignEvaluate(expr1, expr2, expr3, expr4, expr5); AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
} else { } else {
auto expr1 = g.lazyAssign( auto expr1 =
accumulatedRou * g + ((real)1 - rou) * grad.square()); g.lazyAssign(accumulatedRou * g + ((real)1 - rou) * grad.square());
AssignEvaluate(expr1, expr2, expr3, expr4, expr5); AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
} }
...@@ -127,8 +127,8 @@ void decayedAdagradApply(BaseMatrix& value, ...@@ -127,8 +127,8 @@ void decayedAdagradApply(BaseMatrix& value,
real decayRate, real decayRate,
bool firstTime) { bool firstTime) {
auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal()); auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal());
auto expr3 = mom.lazyAssign( auto expr3 = mom.lazyAssign(mom * momentum -
mom * momentum - learningRate * lr * (grad + value * decayRate)); learningRate * lr * (grad + value * decayRate));
auto expr4 = value.lazyAssign(value + mom); auto expr4 = value.lazyAssign(value + mom);
if (firstTime) { if (firstTime) {
...@@ -136,8 +136,8 @@ void decayedAdagradApply(BaseMatrix& value, ...@@ -136,8 +136,8 @@ void decayedAdagradApply(BaseMatrix& value,
AssignEvaluate(expr1, expr2, expr3, expr4); AssignEvaluate(expr1, expr2, expr3, expr4);
} else { } else {
auto expr1 = accum.lazyAssign( auto expr1 = accum.lazyAssign(accumulatedRou * accum +
accumulatedRou * accum + ((real)1 - rou) * grad.square()); ((real)1 - rou) * grad.square());
AssignEvaluate(expr1, expr2, expr3, expr4); AssignEvaluate(expr1, expr2, expr3, expr4);
} }
...@@ -153,13 +153,12 @@ void adamApply(BaseMatrix& value, ...@@ -153,13 +153,12 @@ void adamApply(BaseMatrix& value,
real beta2_power, real beta2_power,
real epsilon, real epsilon,
real learningRate) { real learningRate) {
real alpha = learningRate * real alpha =
std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad); auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square()); auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square());
auto expr3 = value.lazyAssign( auto expr3 = value.lazyAssign(value - (mom * alpha) / (v.sqrt() + epsilon));
value - (mom * alpha) / (v.sqrt() + epsilon));
AssignEvaluate(expr1, expr2, expr3); AssignEvaluate(expr1, expr2, expr3);
} }
...@@ -173,10 +172,10 @@ void adamaxApply(BaseMatrix& value, ...@@ -173,10 +172,10 @@ void adamaxApply(BaseMatrix& value,
int64_t step, int64_t step,
real alpha) { real alpha) {
auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad); auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
auto expr2 = u.lazyAssign( auto expr2 =
(beta2 * u > grad.abs()).condition(beta2 * u, grad.abs())); u.lazyAssign((beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()));
auto expr3 = value.lazyAssign( auto expr3 = value.lazyAssign(
value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u)); value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u));
AssignEvaluate(expr1, expr2, expr3); AssignEvaluate(expr1, expr2, expr3);
} }
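adamaxApply replaces Adam's second moment with an exponentially weighted infinity norm u of the gradient. An illustrative per-element sketch (helper name made up):

  #include <algorithm>
  #include <cmath>
  #include <cstdint>

  void adamaxSketch(float& value, float grad, float& mom, float& u,
                    float beta1, float beta2, std::int64_t step, float alpha) {
    mom = beta1 * mom + (1.0f - beta1) * grad;
    u = std::max(beta2 * u, std::fabs(grad));
    value -= (alpha / (1.0f - std::pow(beta1, static_cast<float>(step)))) *
             (mom / u);
  }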
...@@ -322,8 +321,8 @@ void adamApply(BaseMatrix& value, ...@@ -322,8 +321,8 @@ void adamApply(BaseMatrix& value,
real beta2_power, real beta2_power,
real epsilon, real epsilon,
real learningRate) { real learningRate) {
real alpha = learningRate * real alpha =
std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
// m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
mom = beta1 * mom + ((real)1 - beta1) * grad; mom = beta1 * mom + ((real)1 - beta1) * grad;
...@@ -331,7 +330,7 @@ void adamApply(BaseMatrix& value, ...@@ -331,7 +330,7 @@ void adamApply(BaseMatrix& value,
// v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2 // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
v = beta2 * v + ((real)1 - beta2) * grad.square(); v = beta2 * v + ((real)1 - beta2) * grad.square();
value -= (mom * alpha) / (v.sqrt() + epsilon); value -= (mom * alpha) / (v.sqrt() + epsilon);
} }
void adamaxApply(BaseMatrix& value, void adamaxApply(BaseMatrix& value,
......
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/math/Matrix.h"
#include "TensorCheck.h" #include "TensorCheck.h"
#include "paddle/math/Matrix.h"
using paddle::Matrix; using paddle::Matrix;
using paddle::CpuMatrix; using paddle::CpuMatrix;
...@@ -26,25 +26,25 @@ using paddle::GpuIVector; ...@@ -26,25 +26,25 @@ using paddle::GpuIVector;
using autotest::TensorCheckEqual; using autotest::TensorCheckEqual;
using autotest::TensorCheckErr; using autotest::TensorCheckErr;
#define INIT_UNARY(A1, A2) \ #define INIT_UNARY(A1, A2) \
Tensor A1(height, width); \ Tensor A1(height, width); \
Tensor A2(height, width); \ Tensor A2(height, width); \
A1.randomizeUniform(); \ A1.randomizeUniform(); \
A2.copyFrom(A1) A2.copyFrom(A1)
#define INIT_BINARY(A1, A2, B) \ #define INIT_BINARY(A1, A2, B) \
INIT_UNARY(A1, A2); \ INIT_UNARY(A1, A2); \
Tensor B(height, width); \ Tensor B(height, width); \
B.randomizeUniform() B.randomizeUniform()
#define INIT_TERNARY(A1, A2, B, C) \ #define INIT_TERNARY(A1, A2, B, C) \
INIT_BINARY(A1, A2, B); \ INIT_BINARY(A1, A2, B); \
Tensor C(height, width); \ Tensor C(height, width); \
C.randomizeUniform() C.randomizeUniform()
#define INIT_QUATERNARY(A1, A2, B, C, D) \ #define INIT_QUATERNARY(A1, A2, B, C, D) \
INIT_TERNARY(A1, A2, B, C); \ INIT_TERNARY(A1, A2, B, C); \
Tensor D(height, width); \ Tensor D(height, width); \
D.randomizeUniform() D.randomizeUniform()
template<typename Tensor> template <typename Tensor>
struct TestUnaryMatrix { struct TestUnaryMatrix {
typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc; typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
...@@ -59,7 +59,7 @@ struct TestUnaryMatrix { ...@@ -59,7 +59,7 @@ struct TestUnaryMatrix {
} }
}; };
template<typename Tensor> template <typename Tensor>
struct TestBinaryMatrix { struct TestBinaryMatrix {
typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B)> BinaryFunc; typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B)> BinaryFunc;
...@@ -74,10 +74,10 @@ struct TestBinaryMatrix { ...@@ -74,10 +74,10 @@ struct TestBinaryMatrix {
} }
}; };
template<typename Tensor> template <typename Tensor>
struct TestTernaryMatrix { struct TestTernaryMatrix {
typedef std::function<void( typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C)>
Tensor& A1, Tensor& A2, Tensor& B, Tensor& C)> TernaryFunc; TernaryFunc;
explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) { explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) {
for (auto height : {1, 11, 73, 128, 200, 330}) { for (auto height : {1, 11, 73, 128, 200, 330}) {
...@@ -90,10 +90,11 @@ struct TestTernaryMatrix { ...@@ -90,10 +90,11 @@ struct TestTernaryMatrix {
} }
}; };
template<typename Tensor> template <typename Tensor>
struct TestQuaternaryMatrix { struct TestQuaternaryMatrix {
typedef std::function<void( typedef std::function<void(
Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)> QuaternaryFunc; Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)>
QuaternaryFunc;
explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) { explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) {
for (auto height : {1, 11, 73, 128, 200, 330}) { for (auto height : {1, 11, 73, 128, 200, 330}) {
...@@ -106,7 +107,7 @@ struct TestQuaternaryMatrix { ...@@ -106,7 +107,7 @@ struct TestQuaternaryMatrix {
} }
}; };
template<typename Tensor, class T> template <typename Tensor, class T>
struct TestUnaryVectorT { struct TestUnaryVectorT {
typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc; typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
...@@ -142,11 +143,11 @@ void SetTensorValue(Matrix& matrix, real value) { ...@@ -142,11 +143,11 @@ void SetTensorValue(Matrix& matrix, real value) {
} }
} }
template<typename Tensor> template <typename Tensor>
void testTensorAddScalar(Tensor& A1, Tensor& A2) { void testTensorAddScalar(Tensor& A1, Tensor& A2) {
real p1 = 2.5; real p1 = 2.5;
real p2 = 3.0; real p2 = 3.0;
A1.add(p1); // a += p A1.add(p1); // a += p
A2 += p1; A2 += p1;
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
...@@ -155,7 +156,7 @@ void testTensorAddScalar(Tensor& A1, Tensor& A2) { ...@@ -155,7 +156,7 @@ void testTensorAddScalar(Tensor& A1, Tensor& A2) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSubScalar(Tensor& A1, Tensor& A2) { void testTensorSubScalar(Tensor& A1, Tensor& A2) {
real p = 2.5; real p = 2.5;
A1.subScalar(p); // a -= p A1.subScalar(p); // a -= p
...@@ -163,7 +164,7 @@ void testTensorSubScalar(Tensor& A1, Tensor& A2) { ...@@ -163,7 +164,7 @@ void testTensorSubScalar(Tensor& A1, Tensor& A2) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorMulScalar(Tensor& A1, Tensor& A2) { void testTensorMulScalar(Tensor& A1, Tensor& A2) {
real p = 2.5; real p = 2.5;
A1.mulScalar(p); // a *= p A1.mulScalar(p); // a *= p
...@@ -177,7 +178,7 @@ void testTensorMulScalar(Tensor& A1, Tensor& A2) { ...@@ -177,7 +178,7 @@ void testTensorMulScalar(Tensor& A1, Tensor& A2) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorDivScalar(Tensor& A1, Tensor& A2) { void testTensorDivScalar(Tensor& A1, Tensor& A2) {
real p = 2.5; real p = 2.5;
A1.divScalar(p); // a /= p A1.divScalar(p); // a /= p
...@@ -185,44 +186,44 @@ void testTensorDivScalar(Tensor& A1, Tensor& A2) { ...@@ -185,44 +186,44 @@ void testTensorDivScalar(Tensor& A1, Tensor& A2) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorNeg(Tensor& A1, Tensor& A2) { void testTensorNeg(Tensor& A1, Tensor& A2) {
A1.neg(); // a = -a A1.neg(); // a = -a
A2 = -A2; A2 = -A2;
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorAbs(Tensor& A1, Tensor& A2) { void testTensorAbs(Tensor& A1, Tensor& A2) {
A1.abs2(); // a = a > 0 ? a : -a A1.abs2(); // a = a > 0 ? a : -a
A2 = A2.abs(); A2 = A2.abs();
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSquare(Tensor& A1, Tensor& A2) { void testTensorSquare(Tensor& A1, Tensor& A2) {
A1.square2(); // a = a * a A1.square2(); // a = a * a
A2 = A2.square(); A2 = A2.square();
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorReciprocal(Tensor& A1, Tensor& A2) { void testTensorReciprocal(Tensor& A1, Tensor& A2) {
A1.reciprocal2(); // a = 1.0f / a A1.reciprocal2(); // a = 1.0f / a
A2 = A2.reciprocal(); A2 = A2.reciprocal();
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSign(Tensor& A1, Tensor& A2) { void testTensorSign(Tensor& A1, Tensor& A2) {
A1.sign2(); // a = (a > 0) - (a < 0) A1.sign2(); // a = (a > 0) - (a < 0)
A2 = A2.sign(); A2 = A2.sign();
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorAssign(Tensor& A1, Tensor& A2) { void testTensorAssign(Tensor& A1, Tensor& A2) {
A1.assign(1.5); // a = p A1.assign(1.5); // a = p
A2 = A2.constant(1.5); A2 = A2.constant(1.5);
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
...@@ -235,7 +236,7 @@ void testTensorAssign(Tensor& A1, Tensor& A2) { ...@@ -235,7 +236,7 @@ void testTensorAssign(Tensor& A1, Tensor& A2) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testUnaryBaseOp(Tensor& A1, Tensor& A2) { void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
testTensorAddScalar(A1, A2); testTensorAddScalar(A1, A2);
testTensorSubScalar(A1, A2); testTensorSubScalar(A1, A2);
...@@ -249,9 +250,9 @@ void testUnaryBaseOp(Tensor& A1, Tensor& A2) { ...@@ -249,9 +250,9 @@ void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
testTensorAssign(A1, A2); testTensorAssign(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) { void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
A1.add(2); // a += p A1.add(2); // a += p
A2 += 2; A2 += 2;
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
...@@ -266,46 +267,46 @@ void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) { ...@@ -266,46 +267,46 @@ void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
TEST(Unary, BaseOp) { TEST(Unary, BaseOp) {
TestUnaryMatrix<CpuMatrix> testCpuMatrix(testUnaryBaseOp<CpuMatrix>); TestUnaryMatrix<CpuMatrix> testCpuMatrix(testUnaryBaseOp<CpuMatrix>);
TestUnaryVectorT<CpuVector, real> testCpuVector(testUnaryBaseOp<CpuVector>); TestUnaryVectorT<CpuVector, real> testCpuVector(testUnaryBaseOp<CpuVector>);
TestUnaryVectorT<CpuIVector, int> TestUnaryVectorT<CpuIVector, int> testCpuIVector(
testCpuIVector(testUnaryBaseOpInt<CpuIVector>); testUnaryBaseOpInt<CpuIVector>);
#ifndef PADDLE_ONLY_CPU #ifndef PADDLE_ONLY_CPU
TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>); TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>); TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
TestUnaryVectorT<GpuIVector, int> TestUnaryVectorT<GpuIVector, int> testGpuIVector(
testGpuIVector(testUnaryBaseOpInt<GpuIVector>); testUnaryBaseOpInt<GpuIVector>);
#endif #endif
} }
template<typename Tensor> template <typename Tensor>
void testTensorExp(Tensor& A1, Tensor& A2) { void testTensorExp(Tensor& A1, Tensor& A2) {
A1.exp2(); // a = exp(a) A1.exp2(); // a = exp(a)
A2 = A2.exp(); A2 = A2.exp();
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorLog(Tensor& A1, Tensor& A2) { void testTensorLog(Tensor& A1, Tensor& A2) {
A1.log2(); // a = log(a) A1.log2(); // a = log(a)
A2 = A2.log(); A2 = A2.log();
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSqrt(Tensor& A1, Tensor& A2) { void testTensorSqrt(Tensor& A1, Tensor& A2) {
A1.sqrt2(); // a = sqrt(a) A1.sqrt2(); // a = sqrt(a)
A2 = A2.sqrt(); A2 = A2.sqrt();
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorPow(Tensor& A1, Tensor& A2) { void testTensorPow(Tensor& A1, Tensor& A2) {
A1.pow2(3.2); // a = pow(a, p) A1.pow2(3.2); // a = pow(a, p)
A2 = A2.pow(3.2); A2 = A2.pow(3.2);
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testUnayrMathOp(Tensor& A1, Tensor& A2) { void testUnayrMathOp(Tensor& A1, Tensor& A2) {
testTensorExp(A1, A2); testTensorExp(A1, A2);
testTensorLog(A1, A2); testTensorLog(A1, A2);
...@@ -321,7 +322,7 @@ TEST(Unary, MathOp) { ...@@ -321,7 +322,7 @@ TEST(Unary, MathOp) {
#endif #endif
} }
template<typename Tensor> template <typename Tensor>
void testTensorClip(Tensor& A1, Tensor& A2) { void testTensorClip(Tensor& A1, Tensor& A2) {
real p1 = 0.003f; real p1 = 0.003f;
real p2 = 0.877f; real p2 = 0.877f;
...@@ -331,7 +332,7 @@ void testTensorClip(Tensor& A1, Tensor& A2) { ...@@ -331,7 +332,7 @@ void testTensorClip(Tensor& A1, Tensor& A2) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) { void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
real p = 0.5f; real p = 0.5f;
A1.biggerThanScalar(p); // a = a > p ? 1.0f : 0.0f A1.biggerThanScalar(p); // a = a > p ? 1.0f : 0.0f
...@@ -339,7 +340,7 @@ void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) { ...@@ -339,7 +340,7 @@ void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorapplyL1(Tensor& A1, Tensor& A2) { void testTensorapplyL1(Tensor& A1, Tensor& A2) {
/** /**
* T lambda = p; * T lambda = p;
...@@ -351,14 +352,15 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2) { ...@@ -351,14 +352,15 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2) {
real learningRate = 0.7f; real learningRate = 0.7f;
real decayRate = 0.6f; real decayRate = 0.6f;
A1.applyL1(learningRate, decayRate); A1.applyL1(learningRate, decayRate);
A2 = (A2 > (learningRate * decayRate)).condition( A2 = (A2 > (learningRate * decayRate))
(A2 - (learningRate * decayRate)), .condition(
(A2 < -(learningRate * decayRate)).condition( (A2 - (learningRate * decayRate)),
(A2 + (learningRate * decayRate)), (real)0.0)); (A2 < -(learningRate * decayRate))
.condition((A2 + (learningRate * decayRate)), (real)0.0));
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testUnayrCompareOp(Tensor& A1, Tensor& A2) { void testUnayrCompareOp(Tensor& A1, Tensor& A2) {
testTensorClip(A1, A2); testTensorClip(A1, A2);
testTensorBiggerThanScalar(A1, A2); testTensorBiggerThanScalar(A1, A2);
...@@ -377,7 +379,7 @@ TEST(Unary, CompareOp) { ...@@ -377,7 +379,7 @@ TEST(Unary, CompareOp) {
#endif #endif
} }
template<typename Tensor> template <typename Tensor>
void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
real p1 = 2.5; real p1 = 2.5;
real p2 = 3.2; real p2 = 3.2;
...@@ -406,7 +408,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -406,7 +408,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
real p = 2.5; real p = 2.5;
A1.sub(B); // a -= b A1.sub(B); // a -= b
...@@ -422,7 +424,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -422,7 +424,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
real p = 2.5; real p = 2.5;
A1.mulScalar(B, p); // a = b * p A1.mulScalar(B, p); // a = b * p
...@@ -442,7 +444,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -442,7 +444,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
real p = 2.5; real p = 2.5;
A1.divScalar(B, p); // a = b / p A1.divScalar(B, p); // a = b / p
...@@ -454,28 +456,28 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -454,28 +456,28 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) {
A1.assign(B); // a = b A1.assign(B); // a = b
A2 = B; A2 = B;
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) {
B.square2(A1); // b = a * a B.square2(A1); // b = a * a
A2 = B.square(); A2 = B.square();
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
A1.squareDerivative(B); // a *= 2.0 * b A1.squareDerivative(B); // a *= 2.0 * b
A2 = A2 * (real)2.0 * B; A2 = A2 * (real)2.0 * B;
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
B.reciprocal2(A1); // b = 1.0f / a B.reciprocal2(A1); // b = 1.0f / a
A2 = B.reciprocal(); A2 = B.reciprocal();
...@@ -490,33 +492,33 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -490,33 +492,33 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
real learningRate = 0.7f; real learningRate = 0.7f;
real decayRate = 1.2f; real decayRate = 1.2f;
A1.applyL2(B, learningRate, decayRate); // a *= (1.0f / (1.0f + p * b)) A1.applyL2(B, learningRate, decayRate); // a *= (1.0f / (1.0f + p * b))
A2 *= (B.constant(1.0f) + A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B)
B.constant(learningRate * decayRate) * B).reciprocal(); .reciprocal();
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
A1.reciprocalDerivative(B); // a *= -b * b A1.reciprocalDerivative(B); // a *= -b * b
A2 *= (-B) * B; A2 *= (-B) * B;
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) {
B.sign2(A1); // b = a > 0.0f ? 1.0f : -1.0f B.sign2(A1); // b = a > 0.0f ? 1.0f : -1.0f
A2 = B.sign(); A2 = B.sign();
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) {
B.abs2(A1); // b = a > 0.0f ? a : -a B.abs2(A1); // b = a > 0.0f ? a : -a
A2 = B.abs(); A2 = B.abs();
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) { void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
testTensorAdd(A1, A2, B); testTensorAdd(A1, A2, B);
testTensorSub(A1, A2, B); testTensorSub(A1, A2, B);
...@@ -539,7 +541,7 @@ TEST(Binary, BaseOp) { ...@@ -539,7 +541,7 @@ TEST(Binary, BaseOp) {
#endif #endif
} }
template<typename Tensor> template <typename Tensor>
void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
// a = exp(b) // a = exp(b)
A1.exp2(B); A1.exp2(B);
...@@ -547,14 +549,14 @@ void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -547,14 +549,14 @@ void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
A1.expDerivative(B); // a *= b A1.expDerivative(B); // a *= b
A2 *= B; A2 *= B;
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
// a = log(b) // a = log(b)
A1.log2(B); A1.log2(B);
...@@ -562,7 +564,7 @@ void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -562,7 +564,7 @@ void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
// a = sqrt(b) // a = sqrt(b)
A1.sqrt2(B); A1.sqrt2(B);
...@@ -570,7 +572,7 @@ void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -570,7 +572,7 @@ void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
// a = 1.0f / sqrt(b) // a = 1.0f / sqrt(b)
A1.invSqrt(B); A1.invSqrt(B);
...@@ -578,14 +580,14 @@ void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -578,14 +580,14 @@ void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) {
A1.pow2(B, 2.5f); // a = pow(b, p) A1.pow2(B, 2.5f); // a = pow(b, p)
A2 = B.pow(2.5f); A2 = B.pow(2.5f);
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
/* /*
* const T THRESHOLD = 40.0; * const T THRESHOLD = 40.0;
...@@ -597,12 +599,14 @@ void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -597,12 +599,14 @@ void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
real THRESHOLD = 40.0; real THRESHOLD = 40.0;
A2 = (B.constant(1.0f) + A2 = (B.constant(1.0f) +
(B > THRESHOLD).condition( (B > THRESHOLD)
THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)).exp()).log(); .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))
.exp())
.log();
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
/* /*
* const T THRESHOLD = 40.0; * const T THRESHOLD = 40.0;
...@@ -612,14 +616,16 @@ void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -612,14 +616,16 @@ void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
*/ */
A1.softreluDerivative(B); A1.softreluDerivative(B);
real THRESHOLD = 40.0; real THRESHOLD = 40.0;
A2 = A2 * (B.constant(1.0f) - A2 = A2 *
(B.constant(-1.0f) * (B.constant(1.0f) -
(B > THRESHOLD).condition( (B.constant(-1.0f) *
THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))).exp()); (B > THRESHOLD)
.condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)))
.exp());
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
/* /*
const T THRESHOLD_MIN = -40.0; const T THRESHOLD_MIN = -40.0;
...@@ -632,46 +638,47 @@ void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -632,46 +638,47 @@ void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
const real THRESHOLD_MIN = -40.0; const real THRESHOLD_MIN = -40.0;
const real THRESHOLD_MAX = 13.0; const real THRESHOLD_MAX = 13.0;
auto tmp = (B < THRESHOLD_MIN).condition( auto tmp = (B < THRESHOLD_MIN)
THRESHOLD_MIN, (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B)); .condition(THRESHOLD_MIN,
(B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B));
A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal(); A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal();
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
A1.sigmoidDerivative(B); // a *= b * (1 - b) A1.sigmoidDerivative(B); // a *= b * (1 - b)
A2 *= B * (B.constant(1.0f) - B); A2 *= B * (B.constant(1.0f) - B);
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) {
B.tanh(A1); // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0 B.tanh(A1); // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f; A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f;
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
A1.tanhDerivative(B); // a *= 1 - b * b A1.tanhDerivative(B); // a *= 1 - b * b
A2 *= B.constant(1.0f) - B * B; A2 *= B.constant(1.0f) - B * B;
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) {
real p1 = 2.5; real p1 = 2.5;
real p2 = 3.1; real p2 = 3.1;
// b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0) // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)
B.scaledTanh(A1, p1, p2); B.scaledTanh(A1, p1, p2);
A2 = B.constant(p1) * A2 = B.constant(p1) *
(B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) -
- (real)1.0); (real)1.0);
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
real p1 = 2.5; real p1 = 2.5;
real p2 = 3.1; real p2 = 3.1;
...@@ -681,7 +688,7 @@ void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -681,7 +688,7 @@ void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) { void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
testTensorTanhDerivative(A1, A2, B); testTensorTanhDerivative(A1, A2, B);
testTensorScaledTanhDerivative(A1, A2, B); testTensorScaledTanhDerivative(A1, A2, B);
...@@ -708,21 +715,21 @@ TEST(Binary, MathOp) { ...@@ -708,21 +715,21 @@ TEST(Binary, MathOp) {
#endif #endif
} }
template<typename Tensor> template <typename Tensor>
void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) {
B.relu(A1); // b = a > 0.0f ? a : 0.0f B.relu(A1); // b = a > 0.0f ? a : 0.0f
A2 = (B > (real)0.0f).condition(B, (real)0.0f); A2 = (B > (real)0.0f).condition(B, (real)0.0f);
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
A1.reluDerivative(B); // a *= (b > 0.0f ? 1.0f : 0.0f) A1.reluDerivative(B); // a *= (b > 0.0f ? 1.0f : 0.0f)
A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0); A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0);
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
/* /*
* b = a > p1 ? a : p1 * b = a > p1 ? a : p1
...@@ -736,7 +743,7 @@ void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -736,7 +743,7 @@ void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
SetTensorValue(B, 32.0f); SetTensorValue(B, 32.0f);
/* /*
...@@ -748,15 +755,15 @@ void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -748,15 +755,15 @@ void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
A1.absDerivative(B); // a = (b > 0) ? a : (b < 0) ? -a : 0 A1.absDerivative(B); // a = (b > 0) ? a : (b < 0) ? -a : 0
A2 = (B > (real)0.0f).condition(A2, A2 = (B > (real)0.0f)
(B < (real)0.0f).condition(-A2, (real)0.0f)); .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f));
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
real p = 0.613; real p = 0.613;
SetTensorValue(B, p); SetTensorValue(B, p);
...@@ -765,7 +772,7 @@ void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -765,7 +772,7 @@ void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) { void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
/** /**
* T lambda = p * b; * T lambda = p * b;
...@@ -778,12 +785,13 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -778,12 +785,13 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
real decayRate = 0.6f; real decayRate = 0.6f;
A1.applyL1(B, learningRate, decayRate); A1.applyL1(B, learningRate, decayRate);
auto lambda = B.constant(learningRate * decayRate) * B; auto lambda = B.constant(learningRate * decayRate) * B;
A2 = (A2 > lambda).condition( A2 = (A2 > lambda)
(A2 - lambda), (A2 < -lambda).condition((A2 + lambda), (real)0.0f)); .condition((A2 - lambda),
(A2 < -lambda).condition((A2 + lambda), (real)0.0f));
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) { void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
B.subScalar(0.5f); B.subScalar(0.5f);
SetTensorValue(B, 0.0f); SetTensorValue(B, 0.0f);
...@@ -807,7 +815,7 @@ TEST(Binary, CompareOp) { ...@@ -807,7 +815,7 @@ TEST(Binary, CompareOp) {
#endif #endif
} }
template<typename Tensor> template <typename Tensor>
void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
A1.add(B, C); // a = b + c A1.add(B, C); // a = b + c
A2 = B + C; A2 = B + C;
...@@ -833,7 +841,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { ...@@ -833,7 +841,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
A1.sub(B, C); // a = b - c A1.sub(B, C); // a = b - c
A2 = B - C; A2 = B - C;
...@@ -846,7 +854,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { ...@@ -846,7 +854,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
A1.dotMul(B, C); // a = b * c A1.dotMul(B, C); // a = b * c
A2 = B * C; A2 = B * C;
...@@ -892,7 +900,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { ...@@ -892,7 +900,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
A1.dotDiv(B, C); // a = (b == 0.0) ? 0.0 : b / c A1.dotDiv(B, C); // a = (b == 0.0) ? 0.0 : b / c
A2 = (B == (real)0.0).condition((real)0.0, B / C); A2 = (B == (real)0.0).condition((real)0.0, B / C);
...@@ -905,7 +913,7 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { ...@@ -905,7 +913,7 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
real p1 = 1.5; real p1 = 1.5;
real p2 = 2.5; real p2 = 2.5;
...@@ -915,14 +923,14 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { ...@@ -915,14 +923,14 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
A1.softCrossEntropy(B, C); // a = -c * log(b) - (1 - c) * log(1 - b) A1.softCrossEntropy(B, C); // a = -c * log(b) - (1 - c) * log(1 - b)
A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log(); A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log();
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorSoftCrossEntropyBp(Tensor& A1, void testTensorSoftCrossEntropyBp(Tensor& A1,
Tensor& A2, Tensor& A2,
Tensor& B, Tensor& B,
...@@ -932,7 +940,7 @@ void testTensorSoftCrossEntropyBp(Tensor& A1, ...@@ -932,7 +940,7 @@ void testTensorSoftCrossEntropyBp(Tensor& A1,
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
testTensorAdd(A1, A2, B, C); testTensorAdd(A1, A2, B, C);
testTensorSub(A1, A2, B, C); testTensorSub(A1, A2, B, C);
...@@ -952,30 +960,30 @@ TEST(Ternary, BaseOp) { ...@@ -952,30 +960,30 @@ TEST(Ternary, BaseOp) {
#endif #endif
} }
template<typename Tensor> template <typename Tensor>
void testTensorBinaryLabelCrossEntropy(Tensor& A1, void testTensorBinaryLabelCrossEntropy(Tensor& A1,
Tensor& A2, Tensor& A2,
Tensor& B, Tensor& B,
Tensor& C) { Tensor& C) {
A1.binaryLabelCrossEntropy(B, C); // a = c > 0.5 ? -log(b) : -log(1.0 - b) A1.binaryLabelCrossEntropy(B, C); // a = c > 0.5 ? -log(b) : -log(1.0 - b)
A2 = (C > (real)0.5).condition( A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log()));
-(B.log()), -((B.constant(1.0f) - B).log()));
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorBinaryLabelCrossEntropyBp(Tensor& A1, void testTensorBinaryLabelCrossEntropyBp(Tensor& A1,
Tensor& A2, Tensor& A2,
Tensor& B, Tensor& B,
Tensor& C) { Tensor& C) {
// a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b) // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)
A1.binaryLabelCrossEntropyBp(B, C); A1.binaryLabelCrossEntropyBp(B, C);
A2 += (C > (real)0.5).condition( A2 += (C > (real)0.5)
(B.constant(-1.0f) / B), (B.constant(1.0f) - B).reciprocal()); .condition((B.constant(-1.0f) / B),
(B.constant(1.0f) - B).reciprocal());
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorLogisticRegressionLoss(Tensor& A1, void testTensorLogisticRegressionLoss(Tensor& A1,
Tensor& A2, Tensor& A2,
Tensor& B, Tensor& B,
...@@ -991,13 +999,14 @@ void testTensorLogisticRegressionLoss(Tensor& A1, ...@@ -991,13 +999,14 @@ void testTensorLogisticRegressionLoss(Tensor& A1,
*/ */
A1.logisticRegressionLoss(B, C); A1.logisticRegressionLoss(B, C);
real THRESHOLD = 40.0; real THRESHOLD = 40.0;
auto tmp = (B > THRESHOLD).condition( auto tmp =
THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); (B > THRESHOLD)
.condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp; A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp;
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorLogisticRegressionLossBp(Tensor& A1, void testTensorLogisticRegressionLossBp(Tensor& A1,
Tensor& A2, Tensor& A2,
Tensor& B, Tensor& B,
...@@ -1013,28 +1022,29 @@ void testTensorLogisticRegressionLossBp(Tensor& A1, ...@@ -1013,28 +1022,29 @@ void testTensorLogisticRegressionLossBp(Tensor& A1,
*/ */
A1.logisticRegressionLossBp(B, C); A1.logisticRegressionLossBp(B, C);
real THRESHOLD = 40.0; real THRESHOLD = 40.0;
auto tmp = (B > THRESHOLD).condition( auto tmp =
THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); (B > THRESHOLD)
.condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
auto tmp2 = tmp.exp(); auto tmp2 = tmp.exp();
A2 = tmp2 / (C.constant(1.0) + tmp2) - C; A2 = tmp2 / (C.constant(1.0) + tmp2) - C;
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
A1.biggerThan(B, C); // a = (b > c) ? 1.0f : 0.0f A1.biggerThan(B, C); // a = (b > c) ? 1.0f : 0.0f
A2 = (B > C).condition((real)1.0f, (real)0.0f); A2 = (B > C).condition((real)1.0f, (real)0.0f);
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
A1.max2(B, C); // a = (b > c) ? b : c A1.max2(B, C); // a = (b > c) ? b : c
A2 = (B > C).condition(B, C); A2 = (B > C).condition(B, C);
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C); testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C);
testTensorBinaryLabelCrossEntropy(A1, A2, B, C); testTensorBinaryLabelCrossEntropy(A1, A2, B, C);
...@@ -1053,12 +1063,9 @@ TEST(Ternary, CompareOp) { ...@@ -1053,12 +1063,9 @@ TEST(Ternary, CompareOp) {
#endif #endif
} }
template<typename Tensor> template <typename Tensor>
void testQuaternaryAdd(Tensor& A1, void testQuaternaryAdd(
Tensor& A2, Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
Tensor& B,
Tensor& C,
Tensor& D) {
// A1.add3(B, C, D, 1.5f, 2.5f, 3.5f); // a = p1 * b + p2 * c + p3 * d // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f); // a = p1 * b + p2 * c + p3 * d
// A2 = B * 1.5f + C * 2.5f + D * 3.5f; // A2 = B * 1.5f + C * 2.5f + D * 3.5f;
// TensorCheckEqual(A1, A2); // TensorCheckEqual(A1, A2);
...@@ -1084,25 +1091,19 @@ TEST(Quaternary, BaseOp) { ...@@ -1084,25 +1091,19 @@ TEST(Quaternary, BaseOp) {
#endif #endif
} }
template<typename Tensor> template <typename Tensor>
void testTensorBiggerThan(Tensor& A1, void testTensorBiggerThan(
Tensor& A2, Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
Tensor& B,
Tensor& C,
Tensor& D) {
// a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f; // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f;
A1.biggerThan(B, C, D); A1.biggerThan(B, C, D);
A2 = ((B > C && D > (real)0.5) A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5))
|| (B < C && D < (real)0.5)).condition((real)1.0, (real)0.0); .condition((real)1.0, (real)0.0);
TensorCheckEqual(A1, A2); TensorCheckEqual(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorRankLoss(Tensor& A1, void testTensorRankLoss(
Tensor& A2, Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
Tensor& B,
Tensor& C,
Tensor& D) {
/** /**
* const T THRESHOLD = 40.0; a = b - c; * const T THRESHOLD = 40.0; a = b - c;
* a = (a > THRESHOLD) * a = (a > THRESHOLD)
...@@ -1114,19 +1115,17 @@ void testTensorRankLoss(Tensor& A1, ...@@ -1114,19 +1115,17 @@ void testTensorRankLoss(Tensor& A1,
real THRESHOLD = 40.0; real THRESHOLD = 40.0;
auto tmp = B - C; auto tmp = B - C;
auto tmp2 = (tmp > THRESHOLD).condition( auto tmp2 =
THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); (tmp > THRESHOLD)
.condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D; A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D;
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testTensorRankLossBp(Tensor& A1, void testTensorRankLossBp(
Tensor& A2, Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
Tensor& B,
Tensor& C,
Tensor& D) {
/** /**
* const T THRESHOLD = 40.0; a = b - c; * const T THRESHOLD = 40.0; a = b - c;
* a = (a > THRESHOLD) * a = (a > THRESHOLD)
...@@ -1137,20 +1136,18 @@ void testTensorRankLossBp(Tensor& A1, ...@@ -1137,20 +1136,18 @@ void testTensorRankLossBp(Tensor& A1,
A1.rankLossBp(B, C, D); A1.rankLossBp(B, C, D);
real THRESHOLD = 40.0; real THRESHOLD = 40.0;
auto tmp = B - C; auto tmp = B - C;
auto tmp2 = (tmp > THRESHOLD).condition( auto tmp2 =
THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); (tmp > THRESHOLD)
.condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
auto tmp3 = tmp2.exp(); auto tmp3 = tmp2.exp();
A2 = tmp3 / (D.constant(1.0f) + tmp3) - D; A2 = tmp3 / (D.constant(1.0f) + tmp3) - D;
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
template<typename Tensor> template <typename Tensor>
void testQuaternaryCompareOp(Tensor& A1, void testQuaternaryCompareOp(
Tensor& A2, Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
Tensor& B,
Tensor& C,
Tensor& D) {
testTensorBiggerThan(A1, A2, B, C, D); testTensorBiggerThan(A1, A2, B, C, D);
testTensorRankLoss(A1, A2, B, C, D); testTensorRankLoss(A1, A2, B, C, D);
testTensorRankLossBp(A1, A2, B, C, D); testTensorRankLossBp(A1, A2, B, C, D);
......
...@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "PerfUtils.h"
#include "TensorCheck.h"
#include "paddle/math/Matrix.h" #include "paddle/math/Matrix.h"
#include "paddle/math/TensorAssign.h" #include "paddle/math/TensorAssign.h"
#include "TensorCheck.h"
#include "PerfUtils.h"
using paddle::BaseMatrix; using paddle::BaseMatrix;
using paddle::CpuMatrix; using paddle::CpuMatrix;
...@@ -27,14 +27,28 @@ using autotest::TensorCheckErr; ...@@ -27,14 +27,28 @@ using autotest::TensorCheckErr;
typedef std::function<void(int height, int width)> testMatrixFunc; typedef std::function<void(int height, int width)> testMatrixFunc;
void testMatrixCase(testMatrixFunc matrixFunc) { void testMatrixCase(testMatrixFunc matrixFunc) {
for (auto height : {1}) { for (auto height : {1}) {
for (auto width : {1, 32, 64, 128, 512, 1024, 4096, 32768, 65536, 131072, for (auto width : {1,
262144, 524288, 1048576, 2097152, 4194304, 8388608}) { 32,
64,
128,
512,
1024,
4096,
32768,
65536,
131072,
262144,
524288,
1048576,
2097152,
4194304,
8388608}) {
matrixFunc(height, width); matrixFunc(height, width);
} }
} }
} }
template<typename Tensor> template <typename Tensor>
void testLazyAssign(int height, int width) { void testLazyAssign(int height, int width) {
Tensor A1(height, width); Tensor A1(height, width);
Tensor A2(height, width); Tensor A2(height, width);
...@@ -49,40 +63,39 @@ void testLazyAssign(int height, int width) { ...@@ -49,40 +63,39 @@ void testLazyAssign(int height, int width) {
EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;); EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;);
EXPRESSION_PERFORMANCE( EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C);
auto expr1 = A2.lazyAssign(B + C); auto expr2 = A2.lazyAssign(A2 * D);
auto expr2 = A2.lazyAssign(A2 * D); AssignEvaluate(expr1, expr2););
AssignEvaluate(expr1, expr2););
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
} }
TEST(lazyAssign, CPU) { TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); }
testMatrixCase(testLazyAssign<CpuMatrix>);
}
#ifndef PADDLE_ONLY_CPU #ifndef PADDLE_ONLY_CPU
TEST(lazyAssign, GPU) { TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); }
testMatrixCase(testLazyAssign<GpuMatrix>);
}
#endif #endif
template<typename Tensor> template <typename Tensor>
void sgdUpdateTensor(Tensor& A, Tensor& B, Tensor& C, Tensor& D, void sgdUpdateTensor(
real p1, real p2, real p3) { Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) {
C = C * p2 - D * (B + A * p3) * p1; C = C * p2 - D * (B + A * p3) * p1;
A += C; A += C;
} }
void sgdUpdateLazyAssign(BaseMatrix& A, BaseMatrix& B, void sgdUpdateLazyAssign(BaseMatrix& A,
BaseMatrix& C, BaseMatrix& D, BaseMatrix& B,
real p1, real p2, real p3) { BaseMatrix& C,
BaseMatrix& D,
real p1,
real p2,
real p3) {
auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1); auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1);
auto expr2 = A.lazyAssign(A + C); auto expr2 = A.lazyAssign(A + C);
AssignEvaluate(expr1, expr2); AssignEvaluate(expr1, expr2);
} }
template<typename Tensor> template <typename Tensor>
void testSgdUpdate(int height, int width) { void testSgdUpdate(int height, int width) {
Tensor A1(height, width); Tensor A1(height, width);
Tensor A2(height, width); Tensor A2(height, width);
...@@ -113,16 +126,13 @@ void testSgdUpdate(int height, int width) { ...@@ -113,16 +126,13 @@ void testSgdUpdate(int height, int width) {
* a = a + c; * a = a + c;
*/ */
// BaseMatrix API // BaseMatrix API
EXPRESSION_PERFORMANCE( EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3););
A1.sgdUpdate(B, C1, D, p1, p2, p3););
// Tensor expression // Tensor expression
EXPRESSION_PERFORMANCE( EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));
sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));
// lazyAssign // lazyAssign
EXPRESSION_PERFORMANCE( EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));
sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));
TensorCheckErr(A1, A2); TensorCheckErr(A1, A2);
TensorCheckErr(A1, A3); TensorCheckErr(A1, A3);
...@@ -130,12 +140,8 @@ void testSgdUpdate(int height, int width) { ...@@ -130,12 +140,8 @@ void testSgdUpdate(int height, int width) {
TensorCheckErr(C1, C3); TensorCheckErr(C1, C3);
} }
TEST(sgdUpdate, CPU) { TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); }
testMatrixCase(testSgdUpdate<CpuMatrix>);
}
#ifndef PADDLE_ONLY_CPU #ifndef PADDLE_ONLY_CPU
TEST(sgdUpdate, GPU) { TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); }
testMatrixCase(testSgdUpdate<GpuMatrix>);
}
#endif #endif
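Note on the lazyAssign / AssignEvaluate pattern benchmarked above: lazyAssign only records a tensor expression, and AssignEvaluate later runs the recorded expressions together, which is why it is timed against the eager form A1 = B + C; A1 = A1 * D;. The real API presumably aims at evaluating several expressions in one pass (or one GPU kernel); the self-contained sketch below only shows the deferral idea, using plain std::vector and std::function instead of PaddlePaddle's expression-template types.

#include <cstdio>
#include <functional>
#include <vector>

// A deferred computation: nothing runs until run() is called.
struct LazyExpr {
  std::function<void()> run;
};

// "AssignEvaluate" stand-in: replay the recorded expressions back to back.
void AssignEvaluateSketch(LazyExpr& e1, LazyExpr& e2) {
  e1.run();
  e2.run();
}

int main() {
  std::vector<float> a(4, 0.0f), b(4, 2.0f), c(4, 3.0f), d(4, 2.0f);

  // Record the two assignments; no element is computed here.
  LazyExpr expr1{[&] { for (size_t i = 0; i < a.size(); ++i) a[i] = b[i] + c[i]; }};
  LazyExpr expr2{[&] { for (size_t i = 0; i < a.size(); ++i) a[i] *= d[i]; }};

  AssignEvaluateSketch(expr1, expr2);
  std::printf("a[0] = %g\n", a[0]);  // (2 + 3) * 2 = 10
  return 0;
}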
...@@ -79,8 +79,8 @@ void testMatrixMaxSequence(int batchSize, int inputDim) { ...@@ -79,8 +79,8 @@ void testMatrixMaxSequence(int batchSize, int inputDim) {
} }
TEST(Matrix, maxSequence) { TEST(Matrix, maxSequence) {
for (auto batchSize : {1, 10, 128, 1000, 6000}) { for (auto batchSize : {1, 3, 997}) { // prime numbers close to 1, 4, 1024
for (auto inputDim : {1, 32, 100, 512}) { for (auto inputDim : {1, 7, 131}) { // prime numbers close to 1, 8, 128
VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim; VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
testMatrixMaxSequence(batchSize, inputDim); testMatrixMaxSequence(batchSize, inputDim);
} }
...@@ -240,14 +240,10 @@ TEST(Matrix, unary) { ...@@ -240,14 +240,10 @@ TEST(Matrix, unary) {
// inverse matrix // inverse matrix
testMatrixInverse(height); testMatrixInverse(height);
#else #else
LOG(WARNING) << "Cannot run Matrix Inverse Unit Test.\n" LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK"
<< "Failed to find lapack library in current system.\n" << "support so we cannot test matrix inverse. To test "
<< "To address this issue, Please adopt one of the following " << "matrix inverse, please install LAPACKE "
"approaches: \n" << "and MKL/Openblas/ATLAS, and re-build PaddlePaddle.";
<< "1. Simply issue `sudo apt-get install liblapacke-dev` to "
"avoid re-build source code. \n"
<< "2. Install MKL/Openblas/ATLAS and re-build PaddlePaddle "
"source code.";
#endif #endif
} }
} }
...@@ -341,8 +337,8 @@ void testMatrixSoftmaxBp(int height, int width) { ...@@ -341,8 +337,8 @@ void testMatrixSoftmaxBp(int height, int width) {
} }
TEST(Matrix, softmax) { TEST(Matrix, softmax) {
for (auto height : {1, 11, 73, 128, 200}) { for (auto height : {1, 3, 131}) { // prime numbers close to 1, 4, 127
for (auto width : {1, 32, 100, 512, 1000}) { for (auto width : {1, 17, 251}) { // prime numbers close to 1, 16, 256
VLOG(3) << " height=" << height << " width=" << width; VLOG(3) << " height=" << height << " width=" << width;
testMatrixSoftmax(height, width); testMatrixSoftmax(height, width);
...@@ -527,7 +523,7 @@ void testVectorRowFunc(int size) { ...@@ -527,7 +523,7 @@ void testVectorRowFunc(int size) {
} }
TEST(Vector, rowFunc) { TEST(Vector, rowFunc) {
for (auto size : {1, 5, 31, 90, 150, 500, 1000, 4000}) { for (auto size : {1, 3, 997}) { // prime numbers close to 1, 4, 1024
VLOG(3) << " size=" << size; VLOG(3) << " size=" << size;
testVectorRowFunc(size); testVectorRowFunc(size);
} }
...@@ -604,7 +600,7 @@ void testVectorIsEqual(int size) { ...@@ -604,7 +600,7 @@ void testVectorIsEqual(int size) {
} }
TEST(Vector, Equal) { TEST(Vector, Equal) {
for (auto size : {1, 5, 31, 90, 150, 500, 1000, 4000}) { for (auto size : {1, 3, 997}) { // prime numbers close to 1, 4, 1024
VLOG(3) << " size=" << size; VLOG(3) << " size=" << size;
testVectorReset<int>(size); testVectorReset<int>(size);
testVectorReset<real>(size); testVectorReset<real>(size);
...@@ -635,9 +631,8 @@ void testMatrixTopK(int samples, int dim, int beamSize) { ...@@ -635,9 +631,8 @@ void testMatrixTopK(int samples, int dim, int beamSize) {
} }
TEST(Matrix, topK) { TEST(Matrix, topK) {
for (auto samples : {1, 5, 31, 90, 150, 500}) { for (auto samples : {1, 17, 131}) { // prime numbers close to 1, 16, 127
for (auto dim : for (auto dim : {1, 3, 997}) { // prime numbers close to 1, 4, 1024
{1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) {
for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) { for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) {
if (beamSize > dim) continue; if (beamSize > dim) continue;
VLOG(3) << " samples=" << samples << " beamSize=" << beamSize VLOG(3) << " samples=" << samples << " beamSize=" << beamSize
...@@ -650,6 +645,7 @@ TEST(Matrix, topK) { ...@@ -650,6 +645,7 @@ TEST(Matrix, topK) {
void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) { void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) {
int nnz = samples * dim * ratio; int nnz = samples * dim * ratio;
if (nnz < 1) nnz = 1; // Because sparseRand in MathUtil.cpp requires this.
MatrixPtr cpuSrc = std::make_shared<CpuSparseMatrix>(samples, dim, nnz); MatrixPtr cpuSrc = std::make_shared<CpuSparseMatrix>(samples, dim, nnz);
MatrixPtr gpuSrc = std::make_shared<GpuSparseMatrix>(samples, dim, nnz); MatrixPtr gpuSrc = std::make_shared<GpuSparseMatrix>(samples, dim, nnz);
MatrixPtr cpuVal = std::make_shared<CpuMatrix>(samples, beamSize); MatrixPtr cpuVal = std::make_shared<CpuMatrix>(samples, beamSize);
...@@ -683,9 +679,9 @@ void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) { ...@@ -683,9 +679,9 @@ void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) {
} }
TEST(SMatrix, topK) { TEST(SMatrix, topK) {
for (auto samples : {1, 5, 100}) { for (auto samples : {1, 3, 61}) {
for (auto dim : {10000, 10000, 50000}) { for (auto dim : {1, 3, 61}) {
for (auto beamSize : {1, 5, 40, 100, 500}) { for (auto beamSize : {1, 3, 61}) {
for (auto ratio : {0.01, 0.001}) { for (auto ratio : {0.01, 0.001}) {
if (beamSize > dim) continue; if (beamSize > dim) continue;
VLOG(3) << " samples=" << samples << " beamSize=" << beamSize VLOG(3) << " samples=" << samples << " beamSize=" << beamSize
...@@ -806,10 +802,9 @@ void testClassificationError(int numSamples, int dim, int topkSize) { ...@@ -806,10 +802,9 @@ void testClassificationError(int numSamples, int dim, int topkSize) {
} }
TEST(Matrix, classificationError) { TEST(Matrix, classificationError) {
for (auto numSamples : {1, 5, 31, 90, 150, 300}) { for (auto numSamples : {1, 3, 31}) {
for (auto dim : for (auto dim : {1, 3, 31}) {
{1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) { for (auto topkSize : {1, 3, (int)rand() % dim + 1}) {
for (auto topkSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) {
if (topkSize > dim) continue; if (topkSize > dim) continue;
VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize
<< " dim= " << dim; << " dim= " << dim;
...@@ -1016,13 +1011,15 @@ void testAvgPoolFwdBwd(int numSamples, ...@@ -1016,13 +1011,15 @@ void testAvgPoolFwdBwd(int numSamples,
TensorCheckErr(*inputGrad, *inputGpuGrad); TensorCheckErr(*inputGrad, *inputGpuGrad);
} }
// TODO(yi): I noticed many such blindly combinatorial tests in this // TODO(yi): I noticed many such blindly combinatorial tests in this
// file. They are of no help in locating defects at all. // file. They are of no help in locating defects at all.
TEST(Matrix, PoolFwdBwd) { TEST(Matrix, PoolFwdBwd) {
for (auto numSamples : {5, 32}) { for (auto numSamples : {1, 3}) {
for (auto channels : {1, 9, 32}) { for (auto channels : {1, 3}) {
for (auto imgSizeH : {14, 28}) { for (auto imgSizeH : {13, 17}) {
for (auto imgSizeW : {16, 30}) { for (auto imgSizeW : {17, 19}) {
for (auto sizeX : {2, 5}) { for (auto sizeX : {2, 3}) {
for (auto sizeY : {2, 5}) { for (auto sizeY : {2, 3}) {
for (auto sH : {1, 2}) { for (auto sH : {1, 2}) {
for (auto sW : {1, 2}) { for (auto sW : {1, 2}) {
for (auto pH : {0, (sizeY - 1) / 2}) { for (auto pH : {0, (sizeY - 1) / 2}) {
...@@ -1128,8 +1125,8 @@ TEST(Matrix, MaxOutFwdBwd) { ...@@ -1128,8 +1125,8 @@ TEST(Matrix, MaxOutFwdBwd) {
} }
TEST(CpuMatrix, copyFrom) { TEST(CpuMatrix, copyFrom) {
const size_t height = 1000; const size_t height = 31;
const size_t width = 1000; const size_t width = 53;
CpuMatrix cpu(height, width); CpuMatrix cpu(height, width);
GpuMatrix gpu(height, width); GpuMatrix gpu(height, width);
CpuMatrix copy(height, width); CpuMatrix copy(height, width);
...@@ -1149,6 +1146,10 @@ void testBatch2seqPadding(int batchSize, int inputDim) { ...@@ -1149,6 +1146,10 @@ void testBatch2seqPadding(int batchSize, int inputDim) {
IVectorPtr cpuSequence; IVectorPtr cpuSequence;
generateSequenceStartPositions(batchSize, cpuSequence); generateSequenceStartPositions(batchSize, cpuSequence);
for (int i = 0; i < int(cpuSequence->getSize()); ++i) {
(cpuSequence->getData())[i] += 1;  // ensure maxSeqLen is never 0
}
IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
gpuSequence->copyFrom(*cpuSequence); gpuSequence->copyFrom(*cpuSequence);
...@@ -1156,45 +1157,46 @@ void testBatch2seqPadding(int batchSize, int inputDim) { ...@@ -1156,45 +1157,46 @@ void testBatch2seqPadding(int batchSize, int inputDim) {
size_t maxSeqLen = *std::max_element(cpuSequence->getData(), size_t maxSeqLen = *std::max_element(cpuSequence->getData(),
cpuSequence->getData() + numSeq); cpuSequence->getData() + numSeq);
printf("numSeq = %ld, maxSeqLen = %ld\n", numSeq, maxSeqLen);
MatrixPtr cBatch = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim); MatrixPtr cBatch = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
MatrixPtr gBatch = std::make_shared<GpuMatrix>(numSeq * maxSeqLen, inputDim); MatrixPtr gBatch = std::make_shared<GpuMatrix>(numSeq * maxSeqLen, inputDim);
MatrixPtr cCheck = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim); MatrixPtr cCheck = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
hl_sequence2batch_copy_padding(gBatch->getData(), // hl_sequence2batch_copy_padding(gBatch->getData(),
gpuInput->getData(), // gpuInput->getData(),
cpuSequence->getData(), // cpuSequence->getData(),
inputDim, // inputDim,
maxSeqLen, // maxSeqLen,
numSeq, // numSeq,
false, // false,
true); // true);
cCheck->copyFrom(*gBatch); // cCheck->copyFrom(*gBatch);
int* seqStart = cpuSequence->getData(); // int* seqStart = cpuSequence->getData();
float* batchData = cBatch->getData(); // float* batchData = cBatch->getData();
float* seqData = cpuInput->getData(); // float* seqData = cpuInput->getData();
for (size_t i = 0; i < maxSeqLen; i++) { // for (size_t i = 0; i < maxSeqLen; i++) {
for (size_t j = 0; j < numSeq; j++) { // for (size_t j = 0; j < numSeq; j++) {
size_t sequenceStart = seqStart[j]; // size_t sequenceStart = seqStart[j];
size_t sequenceLength = seqStart[j + 1] - seqStart[j]; // size_t sequenceLength = seqStart[j + 1] - seqStart[j];
if (i < sequenceLength) { // if (i < sequenceLength) {
memcpy(batchData + (i * numSeq + j) * inputDim, // memcpy(batchData + (i * numSeq + j) * inputDim,
seqData + (sequenceStart + i) * inputDim, // seqData + (sequenceStart + i) * inputDim,
inputDim * sizeof(real)); // inputDim * sizeof(real));
} else { // } else {
memset(batchData + (i * numSeq + j) * inputDim, // memset(batchData + (i * numSeq + j) * inputDim,
0, // 0,
inputDim * sizeof(real)); // inputDim * sizeof(real));
} // }
} // }
} // }
TensorCheckErr(*cBatch, *cCheck); // TensorCheckErr(*cBatch, *cCheck);
} }
TEST(Matrix, warpCTC) { TEST(Matrix, warpCTC) {
for (auto batchSize : {51, 526, 2884}) { for (auto batchSize : {1, 3, 17}) {
for (auto inputDim : {32, 512, 2026}) { for (auto inputDim : {1, 3, 31}) {
VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim; VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
testBatch2seqPadding(batchSize, inputDim); testBatch2seqPadding(batchSize, inputDim);
} }
......
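For reference, the layout that the (now commented-out) verification loop above checks is the usual sequence-to-batch padding: variable-length sequences are copied into a dense [maxSeqLen x numSeq x inputDim] buffer, zero-padded past each sequence's end. A small self-contained sketch of that copy follows, with made-up sizes and plain std::vector in place of Paddle's Matrix/IVector types.

#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  const int inputDim = 2;
  // Cumulative sequence start offsets, like cpuSequence above:
  // two sequences of lengths 3 and 2.
  std::vector<int> seqStart = {0, 3, 5};
  const int numSeq = static_cast<int>(seqStart.size()) - 1;
  const int maxSeqLen = 3;

  std::vector<float> seqData(seqStart.back() * inputDim);
  for (size_t k = 0; k < seqData.size(); ++k) seqData[k] = static_cast<float>(k);

  // Dense padded buffer of shape [maxSeqLen x numSeq x inputDim].
  std::vector<float> batchData(maxSeqLen * numSeq * inputDim, 0.0f);
  for (int i = 0; i < maxSeqLen; ++i) {
    for (int j = 0; j < numSeq; ++j) {
      const int start = seqStart[j];
      const int len = seqStart[j + 1] - seqStart[j];
      if (i < len) {
        std::memcpy(&batchData[(i * numSeq + j) * inputDim],
                    &seqData[(start + i) * inputDim],
                    inputDim * sizeof(float));
      }  // else: leave the zero padding in place
    }
  }
  std::printf("batchData[0..3] = %g %g %g %g\n",
              batchData[0], batchData[1], batchData[2], batchData[3]);
  return 0;
}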
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
...@@ -63,5 +63,6 @@ op_library(sgd_op SRCS sgd_op.cc sgd_op.cu) ...@@ -63,5 +63,6 @@ op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
op_library(fc_op op_library(fc_op
SRCS fc_op.cc SRCS fc_op.cc
DEPS mul_op rowwise_add_op sigmoid_op softmax_op net_op) DEPS mul_op rowwise_add_op sigmoid_op softmax_op net_op)
op_library(recurrent_op SRCS recurrent_op.cc DEPS op_desc tensor op_registry operator net_op) op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
DEPS op_desc tensor op_registry operator net_op)
cc_test(recurrent_op_test SRCS recurrent_op_test.cc DEPS recurrent_op gtest mul_op add_op) cc_test(recurrent_op_test SRCS recurrent_op_test.cc DEPS recurrent_op gtest mul_op add_op)
...@@ -18,10 +18,10 @@ namespace paddle { ...@@ -18,10 +18,10 @@ namespace paddle {
namespace operators { namespace operators {
class AddOp : public OperatorWithKernel { class AddOp : public OperatorWithKernel {
protected: protected:
void InferShape(const InferShapeContext &ctx) const override { void InferShape(const InferShapeContext &ctx) const override {
PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of AddOp must be two"); PADDLE_ENFORCE_EQ(ctx.InputSize(), 2);
PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one"); PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1);
PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.InputVar(1) != nullptr, PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.InputVar(1) != nullptr,
"Inputs of AddOp must all be set"); "Inputs of AddOp must all be set");
PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr, PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr,
...@@ -33,7 +33,7 @@ protected: ...@@ -33,7 +33,7 @@ protected:
}; };
class AddOpMaker : public OpProtoAndCheckerMaker { class AddOpMaker : public OpProtoAndCheckerMaker {
public: public:
AddOpMaker(OpProto *proto, OpAttrChecker *op_checker) AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The first input of add op"); AddInput("X", "The first input of add op");
...@@ -48,7 +48,7 @@ The equation is: Out = X + Y ...@@ -48,7 +48,7 @@ The equation is: Out = X + Y
}; };
class AddOpGrad : public OperatorWithKernel { class AddOpGrad : public OperatorWithKernel {
protected: protected:
void InferShape(const InferShapeContext &ctx) const override {} void InferShape(const InferShapeContext &ctx) const override {}
}; };
......
...@@ -20,7 +20,7 @@ namespace operators { ...@@ -20,7 +20,7 @@ namespace operators {
template <typename Place, typename T> template <typename Place, typename T>
class AddKernel : public OpKernel { class AddKernel : public OpKernel {
public: public:
void Compute(const ExecutionContext& context) const override { void Compute(const ExecutionContext& context) const override {
auto input0 = context.Input<Tensor>(0); auto input0 = context.Input<Tensor>(0);
auto input1 = context.Input<Tensor>(1); auto input1 = context.Input<Tensor>(1);
......
...@@ -18,7 +18,7 @@ namespace paddle { ...@@ -18,7 +18,7 @@ namespace paddle {
namespace operators { namespace operators {
class OnehotCrossEntropyOp : public OperatorWithKernel { class OnehotCrossEntropyOp : public OperatorWithKernel {
protected: protected:
void InferShape(const InferShapeContext &ctx) const override { void InferShape(const InferShapeContext &ctx) const override {
PADDLE_ENFORCE(ctx.InputSize() == 2, PADDLE_ENFORCE(ctx.InputSize() == 2,
"Input size of OnehotCrossEntropyOp must be two"); "Input size of OnehotCrossEntropyOp must be two");
...@@ -36,8 +36,19 @@ protected: ...@@ -36,8 +36,19 @@ protected:
} }
}; };
class OnehotCrossEntropyGradientOp : public OperatorWithKernel {
protected:
void InferShape(const InferShapeContext &ctx) const override {
auto X_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto X = ctx.Input<Tensor>("X");
// TODO(superjom) add enforce here after helper functions ready
X_grad->Resize(X->dims());
}
};
class OnehotCrossEntropyOpMaker : public OpProtoAndCheckerMaker { class OnehotCrossEntropyOpMaker : public OpProtoAndCheckerMaker {
public: public:
OnehotCrossEntropyOpMaker(OpProto *proto, OpAttrChecker *op_checker) OnehotCrossEntropyOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The first input of OnehotCrossEntropyOp"); AddInput("X", "The first input of OnehotCrossEntropyOp");
...@@ -54,8 +65,11 @@ OnehotCrossEntropy Operator. ...@@ -54,8 +65,11 @@ OnehotCrossEntropy Operator.
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP(onehot_cross_entropy, REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp,
ops::OnehotCrossEntropyOp,
ops::OnehotCrossEntropyOpMaker); ops::OnehotCrossEntropyOpMaker);
REGISTER_OP_CPU_KERNEL(onehot_cross_entropy, REGISTER_OP_CPU_KERNEL(onehot_cross_entropy,
ops::OnehotCrossEntropyOpKernel<ops::CPUPlace, float>); ops::OnehotCrossEntropyOpKernel<ops::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
onehot_cross_entropy_grad,
ops::OnehotCrossEntropyGradientOpKernel<ops::CPUPlace, float>);
...@@ -14,6 +14,3 @@ ...@@ -14,6 +14,3 @@
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/operators/cross_entropy_op.h" #include "paddle/operators/cross_entropy_op.h"
REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
ops::OnehotCrossEntropyOpKernel<ops::GPUPlace, float>);
...@@ -18,28 +18,68 @@ limitations under the License. */ ...@@ -18,28 +18,68 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T>
T tolerable_value(T x) {
static_assert(std::is_floating_point<T>::value,
"tolerable_value works only on float, "
"double and double double.");
const T kApproInf = 1e20;
if (x == INFINITY) {
return kApproInf;
}
if (x == -INFINITY) {
return -kApproInf;
}
return x;
}
template <typename Place, typename T> template <typename Place, typename T>
class OnehotCrossEntropyOpKernel : public OpKernel { class OnehotCrossEntropyOpKernel : public OpKernel {
public: public:
constexpr T LOG_THRESHOLD() const { return static_cast<T>(1e-20); }
void Compute(const ExecutionContext& ctx) const override { void Compute(const ExecutionContext& ctx) const override {
auto X = ctx.Input<Tensor>(0); auto X = ctx.Input<Tensor>("X");
const T* X_data = X->data<T>(); const T* Xdata = X->data<T>();
const int* label_data = ctx.Input<Tensor>(1)->data<int>(); const int* label_data = ctx.Input<Tensor>(1)->data<int>();
auto Y = ctx.Output<Tensor>(0); auto Y = ctx.Output<Tensor>("Y");
Y->mutable_data<T>(ctx.GetPlace()); Y->mutable_data<T>(ctx.GetPlace());
T* Y_data = Y->data<T>(); T* Ydata = Y->data<T>();
int batch_size = X->dims()[0]; int batch_size = X->dims()[0];
int class_num = X->dims()[1]; int class_num = X->dims()[1];
// Y[i] = -log(X[i][j])
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
Y_data[i] = -std::log( int index = i * class_num + label_data[i];
std::max(X_data[i * class_num + label_data[i]], LOG_THRESHOLD())); Ydata[i] = -tolerable_value(std::log(Xdata[index]));
}
}
};
template <typename Place, typename T>
class OnehotCrossEntropyGradientOpKernel : public OpKernel {
public:
void Compute(const ExecutionContext& ctx) const override {
auto X = ctx.Input<Tensor>("X");
auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto label = ctx.Input<Tensor>("label");
auto* dXdata = dX->template mutable_data<T>(ctx.GetPlace());
auto* dYdata = dY->template data<T>();
auto* Xdata = X->template data<T>();
auto* label_data = label->data<int>();
const int batch_size = X->dims()[0];
const int class_num = X->dims()[1];
for (int i = 0; i < batch_size; ++i) {
int index = i * class_num + label_data[i];
dXdata[index] = -tolerable_value(dYdata[i] / Xdata[index]);
} }
} }
}; };
......
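In short, the new kernels above compute y[i] = -log(x[i][label[i]]) in the forward pass and dX[i][label[i]] = -dY[i] / X[i][label[i]] in the backward pass, with tolerable_value capping the infinities that log(0) or a division by zero would otherwise produce. A standalone sketch of that arithmetic over plain arrays (example values are made up; this is not Paddle's Tensor/ExecutionContext API):

#include <cmath>
#include <cstdio>

// Clamp +/-infinity the way tolerable_value above does.
float tolerable(float v) {
  const float kApproInf = 1e20f;
  if (v == INFINITY) return kApproInf;
  if (v == -INFINITY) return -kApproInf;
  return v;
}

int main() {
  const int batch = 2, classes = 3;
  float x[batch][classes] = {{0.2f, 0.7f, 0.1f}, {0.9f, 0.05f, 0.05f}};
  int label[batch] = {1, 0};
  float y[batch];
  float dy[batch] = {1.0f, 1.0f};
  float dx[batch][classes] = {};

  for (int i = 0; i < batch; ++i) {
    y[i] = -tolerable(std::log(x[i][label[i]]));           // forward: -log(x[label])
    dx[i][label[i]] = -tolerable(dy[i] / x[i][label[i]]);  // backward: -dy / x[label]
  }
  std::printf("y = {%g, %g}, dx[0][1] = %g\n", y[0], y[1], dx[0][1]);
  return 0;
}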
...@@ -18,31 +18,29 @@ namespace paddle { ...@@ -18,31 +18,29 @@ namespace paddle {
namespace operators { namespace operators {
class FullyConnectedOp : public NetOp { class FullyConnectedOp : public NetOp {
public: public:
void Init() override { void Init() override {
AddOp(OpRegistry::CreateOp("mul", AddOp(OpRegistry::CreateOp("mul",
{ {
Input("X"), Input("W"), Input("X"), Input("W"),
}, },
{Output("before_act")}, {Output("before_act")}, {}));
{}));
auto b = Input("b"); auto b = Input("b");
if (b != EMPTY_VAR_NAME()) { if (b != framework::kEmptyVarName) {
AddOp(OpRegistry::CreateOp("rowwise_add", AddOp(OpRegistry::CreateOp("rowwise_add",
{Output("before_act"), Input("b")}, {Output("before_act"), Input("b")},
{Output("before_act")}, {Output("before_act")}, {}));
{}));
} }
auto activation = GetAttr<std::string>("activation"); auto activation = GetAttr<std::string>("activation");
AddOp(OpRegistry::CreateOp( AddOp(OpRegistry::CreateOp(activation, {Output("before_act")},
activation, {Output("before_act")}, {Output("Y")}, {})); {Output("Y")}, {}));
CompleteAddOp(false); CompleteAddOp(false);
} }
}; };
class FullyConnectedOpMaker : public OpProtoAndCheckerMaker { class FullyConnectedOpMaker : public OpProtoAndCheckerMaker {
public: public:
FullyConnectedOpMaker(OpProto *proto, OpAttrChecker *op_checker) FullyConnectedOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "the input of fc operator"); AddInput("X", "the input of fc operator");
......
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class FillZerosLikeOp : public OperatorWithKernel { class FillZerosLikeOp : public framework::OperatorWithKernel {
protected: protected:
void InferShape(const framework::InferShapeContext &ctx) const override { void InferShape(const framework::InferShapeContext &ctx) const override {
PADDLE_ENFORCE(ctx.InputSize() == 1UL, PADDLE_ENFORCE(ctx.InputSize() == 1UL,
"Input size of FillZerosLikeOp must be one."); "Input size of FillZerosLikeOp must be one.");
...@@ -33,8 +33,8 @@ protected: ...@@ -33,8 +33,8 @@ protected:
} }
}; };
class FillZerosLikeOpMaker : public OpProtoAndCheckerMaker { class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
FillZerosLikeOpMaker(framework::OpProto *proto, FillZerosLikeOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker) framework::OpAttrChecker *op_checker)
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
...@@ -50,8 +50,7 @@ The output will have the same size with input. ...@@ -50,8 +50,7 @@ The output will have the same size with input.
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP(fill_zeros_like, REGISTER_OP(fill_zeros_like, paddle::operators::FillZerosLikeOp,
paddle::operators::FillZerosLikeOp,
paddle::operators::FillZerosLikeOpMaker); paddle::operators::FillZerosLikeOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
fill_zeros_like, fill_zeros_like,
......
...@@ -20,7 +20,7 @@ namespace operators { ...@@ -20,7 +20,7 @@ namespace operators {
template <typename Place, typename T> template <typename Place, typename T>
class FillZerosLikeKernel : public framework::OpKernel { class FillZerosLikeKernel : public framework::OpKernel {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* output = context.Output<framework::Tensor>(0); auto* output = context.Output<framework::Tensor>(0);
output->mutable_data<T>(context.GetPlace()); output->mutable_data<T>(context.GetPlace());
......
...@@ -18,7 +18,7 @@ namespace paddle { ...@@ -18,7 +18,7 @@ namespace paddle {
namespace operators { namespace operators {
class MeanOp : public OperatorWithKernel { class MeanOp : public OperatorWithKernel {
protected: protected:
void InferShape(const InferShapeContext &ctx) const override { void InferShape(const InferShapeContext &ctx) const override {
PADDLE_ENFORCE(ctx.InputSize() == 1, "Input size of AddOp must be one"); PADDLE_ENFORCE(ctx.InputSize() == 1, "Input size of AddOp must be one");
PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one"); PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one");
...@@ -29,7 +29,7 @@ protected: ...@@ -29,7 +29,7 @@ protected:
}; };
class MeanOpMaker : public OpProtoAndCheckerMaker { class MeanOpMaker : public OpProtoAndCheckerMaker {
public: public:
MeanOpMaker(OpProto *proto, OpAttrChecker *op_checker) MeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input of mean op"); AddInput("X", "The input of mean op");
...@@ -39,9 +39,9 @@ public: ...@@ -39,9 +39,9 @@ public:
}; };
class MeanGradOp : public OperatorWithKernel { class MeanGradOp : public OperatorWithKernel {
protected: protected:
void InferShape(const InferShapeContext &ctx) const override { void InferShape(const InferShapeContext &ctx) const override {
ctx.Output<Tensor>("X" + GRAD_VAR_SUFFIX()) ctx.Output<Tensor>("X" + framework::kGradVarSuffix)
->Resize(ctx.Input<Tensor>("X")->dims()); ->Resize(ctx.Input<Tensor>("X")->dims());
} }
}; };
......
...@@ -20,7 +20,7 @@ namespace operators { ...@@ -20,7 +20,7 @@ namespace operators {
template <typename Place, typename T> template <typename Place, typename T>
class MeanKernel : public OpKernel { class MeanKernel : public OpKernel {
public: public:
void Compute(const ExecutionContext& context) const override { void Compute(const ExecutionContext& context) const override {
auto input = context.Input<Tensor>(0); auto input = context.Input<Tensor>(0);
auto output = context.Output<Tensor>(0); auto output = context.Output<Tensor>(0);
...@@ -37,12 +37,12 @@ public: ...@@ -37,12 +37,12 @@ public:
template <typename Place, typename T> template <typename Place, typename T>
class MeanGradKernel : public OpKernel { class MeanGradKernel : public OpKernel {
public: public:
void Compute(const ExecutionContext& context) const override { void Compute(const ExecutionContext& context) const override {
auto OG = context.Input<Tensor>("Out" + OperatorBase::GRAD_VAR_SUFFIX()); auto OG = context.Input<Tensor>("Out" + framework::kGradVarSuffix);
PADDLE_ENFORCE(framework::product(OG->dims()) == 1, PADDLE_ENFORCE(framework::product(OG->dims()) == 1,
"Mean Gradient should be scalar"); "Mean Gradient should be scalar");
auto IG = context.Output<Tensor>("X" + OperatorBase::GRAD_VAR_SUFFIX()); auto IG = context.Output<Tensor>("X" + framework::kGradVarSuffix);
IG->mutable_data<T>(context.GetPlace()); IG->mutable_data<T>(context.GetPlace());
T ig_size = (T)framework::product(IG->dims()); T ig_size = (T)framework::product(IG->dims());
......
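For reference, the mean gradient above follows from Out = (1/N) * sum_i X[i] with N = product(X.dims()): every input element receives the same share, dX[i] = dOut / N, which the elided remainder of the kernel presumably realizes by broadcasting dOut * (1 / ig_size). For example, a 2x3 input with dOut = 1 yields a dX filled with 1/6.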
...@@ -18,23 +18,27 @@ namespace paddle { ...@@ -18,23 +18,27 @@ namespace paddle {
namespace operators { namespace operators {
class MulOp : public OperatorWithKernel { class MulOp : public OperatorWithKernel {
protected: protected:
void InferShape(const InferShapeContext &ctx) const override { void InferShape(const InferShapeContext &ctx) const override {
PADDLE_ENFORCE(ctx.InputSize() == 2, "The mul op must take two inputs"); PADDLE_ENFORCE(ctx.InputSize() == 2, "The mul op must take two inputs");
auto dim0 = ctx.Input<Tensor>(0)->dims(); auto dim0 = ctx.Input<Tensor>(0)->dims();
auto dim1 = ctx.Input<Tensor>(1)->dims(); auto dim1 = ctx.Input<Tensor>(1)->dims();
PADDLE_ENFORCE(dim0.size() == 2 && dim1.size() == 2, PADDLE_ENFORCE_EQ(dim0.size(), 2,
"The input of mul op must be matrix"); "input X(%s) should be a tensor with 2 dims, a matrix",
PADDLE_ENFORCE( ctx.op_.Input("X"));
dim0[1] == dim1[0], PADDLE_ENFORCE_EQ(dim1.size(), 2,
"input Y(%s) should be a tensor with 2 dims, a matrix",
ctx.op_.Input("Y"));
PADDLE_ENFORCE_EQ(
dim0[1], dim1[0],
"First matrix's width must be equal with second matrix's height."); "First matrix's width must be equal with second matrix's height.");
PADDLE_ENFORCE(ctx.OutputSize() == 1, "The mul op must take one output"); PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1, "The mul op takes only one output");
ctx.Output<Tensor>(0)->Resize({dim0[0], dim1[1]}); ctx.Output<Tensor>(0)->Resize({dim0[0], dim1[1]});
} }
}; };
class MulOpMaker : public OpProtoAndCheckerMaker { class MulOpMaker : public OpProtoAndCheckerMaker {
public: public:
MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The first input of mul op"); AddInput("X", "The first input of mul op");
...@@ -49,7 +53,7 @@ The equation is: Out = X * Y ...@@ -49,7 +53,7 @@ The equation is: Out = X * Y
}; };
class MulOpGrad : public OperatorWithKernel { class MulOpGrad : public OperatorWithKernel {
protected: protected:
void InferShape(const InferShapeContext &ctx) const override {} void InferShape(const InferShapeContext &ctx) const override {}
std::string DebugString() const override { std::string DebugString() const override {
LOG(INFO) << "MulGrad"; LOG(INFO) << "MulGrad";
......
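The three PADDLE_ENFORCE_EQ checks above encode ordinary matrix-multiplication shape rules: X must be [M, K], Y must be [K, N], and the output is resized to [M, N]. For example, X of shape [2, 3] with Y of shape [3, 4] yields an output of shape [2, 4], while a [2, 3] x [4, 5] pair fails the dim0[1] == dim1[0] check.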
...@@ -21,7 +21,7 @@ namespace operators { ...@@ -21,7 +21,7 @@ namespace operators {
template <typename Place, typename T> template <typename Place, typename T>
class MulKernel : public OpKernel { class MulKernel : public OpKernel {
public: public:
void Compute(const ExecutionContext& context) const override { void Compute(const ExecutionContext& context) const override {
Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = { Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
{Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}}; {Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};
......
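The Eigen contraction pair (1, 0) used above contracts dimension 1 of the first operand with dimension 0 of the second, i.e. Out(i, j) = sum_k X(i, k) * Y(k, j), so the kernel is plain matrix multiplication expressed through Eigen's tensor contraction rather than a hand-written loop.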
...@@ -40,7 +40,7 @@ namespace operators { ...@@ -40,7 +40,7 @@ namespace operators {
* it defines. * it defines.
*/ */
class NetOp : public framework::OperatorBase { class NetOp : public framework::OperatorBase {
public: public:
/** /**
   * Infer all the operators' input and output variables' shapes; this will be   * Infer all the operators' input and output variables' shapes; this will be
   * called before every mini-batch   * called before every mini-batch
...@@ -90,7 +90,7 @@ public: ...@@ -90,7 +90,7 @@ public:
std::vector<std::shared_ptr<OperatorBase>> ops_; std::vector<std::shared_ptr<OperatorBase>> ops_;
private: private:
bool add_op_done_{false}; bool add_op_done_{false};
template <typename T, typename KeyType> template <typename T, typename KeyType>
......
...@@ -12,7 +12,7 @@ static int infer_shape_cnt = 0; ...@@ -12,7 +12,7 @@ static int infer_shape_cnt = 0;
static int run_cnt = 0; static int run_cnt = 0;
class TestOp : public OperatorBase { class TestOp : public OperatorBase {
public: public:
void InferShape(const framework::Scope& scope) const override { void InferShape(const framework::Scope& scope) const override {
++infer_shape_cnt; ++infer_shape_cnt;
} }
...@@ -23,7 +23,7 @@ public: ...@@ -23,7 +23,7 @@ public:
}; };
class EmptyOp : public OperatorBase { class EmptyOp : public OperatorBase {
public: public:
void InferShape(const Scope& scope) const override {} void InferShape(const Scope& scope) const override {}
void Run(const Scope& scope, void Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const override {} const platform::DeviceContext& dev_ctx) const override {}
......
...@@ -25,214 +25,75 @@ ...@@ -25,214 +25,75 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace rnn {
void SegmentInputs(const std::vector<Scope*>& step_scopes,
const std::vector<Link>& inlinks,
const size_t seq_len,
bool infer_shape_mode) {
PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided.");
for (size_t i = 0; i < inlinks.size(); ++i) {
auto input_var = step_scopes[0]->FindVar(inlinks[i].external);
PADDLE_ENFORCE(input_var != nullptr,
"input link [%s] is not in scope.",
inlinks[i].external);
Tensor* input = input_var->GetMutable<Tensor>();
DDim dims = input->dims();
PADDLE_ENFORCE(static_cast<size_t>(dims[0]) == seq_len,
"all the inlinks must have same length");
DDim step_dims = slice_ddim(dims, 1, dims.size());
for (size_t j = 0; j < seq_len; j++) {
Tensor* step_input =
step_scopes[j]->NewVar(inlinks[i].internal)->GetMutable<Tensor>();
if (!infer_shape_mode) {
*step_input = input->Slice<float>(j, j + 1);
}
step_input->Resize(step_dims);
}
}
}
void ConcatOutputs(const std::vector<Scope*>& step_scopes,
const std::vector<Link>& outlinks,
const size_t seq_len,
bool infer_shape_mode) {
for (size_t i = 0; i < outlinks.size(); i++) {
auto output_var = step_scopes[0]->FindVar(outlinks[i].external);
PADDLE_ENFORCE(output_var != nullptr,
"output link [%s] is not in scope.",
outlinks[i].external);
Tensor* output = output_var->GetMutable<Tensor>();
if (infer_shape_mode) {
DDim step_dims = step_scopes[0]
->FindVar(outlinks[i].internal)
->GetMutable<Tensor>()
->dims();
std::vector<int> dims_vec = vectorize(step_dims);
dims_vec.insert(dims_vec.begin(), seq_len);
output->Resize(make_ddim(dims_vec));
} else {
output->mutable_data<float>(platform::CPUPlace());
for (size_t j = 0; j < seq_len; j++) {
Tensor* step_output =
step_scopes[j]->FindVar(outlinks[i].internal)->GetMutable<Tensor>();
// TODO(luotao02) data type and platform::DeviceContext() should set
// correctly
(output->Slice<float>(j, j + 1))
.CopyFrom<float>(*step_output, platform::CPUPlace());
}
}
}
}
void LinkMemories(const std::vector<Scope*>& scopes,
const std::vector<rnn::MemoryAttr>& memories,
const size_t step_id,
const int offset,
bool infer_shape_mode) {
PADDLE_ENFORCE(step_id < scopes.size(),
"step [%d] is out of range of step scopes' size [%d]",
step_id,
scopes.size());
PADDLE_ENFORCE(static_cast<int>(step_id) + offset >= 0,
"offset [%d] must be large than -[%d]",
offset,
step_id);
PADDLE_ENFORCE(step_id + offset < scopes.size(),
"offset [%d] is out of range, it must be less than (%d - %d)",
offset,
scopes.size(),
step_id);
auto scope = scopes[step_id];
auto linked_scope = scopes[step_id + offset];
for (auto& attr : memories) {
auto mem = scope->FindVar(attr.pre_var)->GetMutable<Tensor>();
auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable<Tensor>();
if (infer_shape_mode) {
mem->Resize(linked_mem->dims());
} else {
mem->ShareDataWith<float>(*linked_mem);
}
}
}
void InitArgument(const ArgumentName& name,
Argument* arg,
const OperatorBase& op) {
arg->step_net = op.Input(name.step_net);
arg->step_scopes = op.Output(name.step_scopes);
auto inlinks = op.Inputs(name.inlinks);
auto inlink_alias = op.GetAttr<std::vector<std::string>>(name.inlink_alias);
PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(),
"the size of inlinks and inlink_alias don't match:%d,%d",
inlinks.size(),
inlink_alias.size());
for (size_t i = 0; i < inlinks.size(); ++i) {
rnn::Link link;
link.external = inlinks[i];
link.internal = inlink_alias[i];
(arg->inlinks).push_back(link);
}
auto outlinks = op.Outputs(name.outlinks);
auto outlink_alias = op.GetAttr<std::vector<std::string>>(name.outlink_alias);
PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(),
"the size of outlinks and outlink_alias don't match:%d,%d",
outlinks.size(),
outlink_alias.size());
for (size_t i = 0; i < outlinks.size(); ++i) {
rnn::Link link;
link.external = outlinks[i];
link.internal = outlink_alias[i];
(arg->outlinks).push_back(link);
}
auto boot_memories = op.Inputs(name.boot_memories);
// attributes
auto memories = op.GetAttr<std::vector<std::string>>(name.memories);
auto pre_memories = op.GetAttr<std::vector<std::string>>(name.pre_memories);
PADDLE_ENFORCE(memories.size() == boot_memories.size(),
"the size of memories, boot_memories don't match:%d,%d",
memories.size(),
boot_memories.size());
PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(),
"the size of pre_memories, boot_memories don't match:%d,%d",
pre_memories.size(),
boot_memories.size());
  PADDLE_ENFORCE(memories.size() > 0, "at least one memory should be set");
for (size_t i = 0; i < memories.size(); ++i) {
rnn::MemoryAttr mem_attr;
mem_attr.var = memories[i];
mem_attr.pre_var = pre_memories[i];
mem_attr.boot_var = boot_memories[i];
(arg->memories).push_back(mem_attr);
}
}
} // namespace rnn
void RecurrentAlgorithm::InferShape(const Scope& scope) const { void RecurrentAlgorithm::InferShape(const Scope& scope) const {
seq_len_ = scope.FindVar((arg_->inlinks[0]).external) seq_len_ = scope.FindVar((arg_->inlinks[0]).external)
->GetMutable<Tensor>() ->GetMutable<Tensor>()
->dims()[0]; ->dims()[0];
CreateScopes(scope); CreateScopes(scope);
auto step_scopes = GetStepScopes(scope); auto step_scopes = GetStepScopes(scope);
rnn::SegmentInputs( rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
step_scopes, arg_->inlinks, seq_len_, true /*infer_shape_mode*/); true /*infer_shape_mode*/);
InitMemories(step_scopes[0], true /*infer_shape_mode*/); InitMemories(step_scopes[0], true /*infer_shape_mode*/);
Variable* net = scope.FindVar(arg_->step_net); Variable* net = scope.FindVar(arg_->step_net);
PADDLE_ENFORCE(net != nullptr, "failed to get step net"); PADDLE_ENFORCE(net != nullptr, "failed to get step net");
for (size_t i = 0; i < seq_len_; i++) { for (size_t i = 0; i < seq_len_; i++) {
if (i > 0) { if (i > 0) {
rnn::LinkMemories( rnn::LinkMemories(step_scopes, arg_->memories, i, -1,
step_scopes, arg_->memories, i, -1, true /*infer_shape_mode*/); true /*infer_shape_mode*/);
} }
net->GetMutable<NetOp>()->InferShape(*step_scopes[i]); net->GetMutable<NetOp>()->InferShape(*step_scopes[i]);
} }
rnn::ConcatOutputs( rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
step_scopes, arg_->outlinks, seq_len_, true /*infer_shape_mode*/); true /*infer_shape_mode*/);
} }
void RecurrentAlgorithm::Run(const Scope& scope, void RecurrentAlgorithm::Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const { const platform::DeviceContext& dev_ctx) const {
auto step_scopes = GetStepScopes(scope); auto step_scopes = GetStepScopes(scope);
rnn::SegmentInputs( rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
step_scopes, arg_->inlinks, seq_len_, false /*infer_shape_mode*/); false /*infer_shape_mode*/);
InitMemories(step_scopes[0], false /*infer_shape_mode*/); InitMemories(step_scopes[0], false /*infer_shape_mode*/);
Variable* net = scope.FindVar(arg_->step_net); Variable* net = scope.FindVar(arg_->step_net);
for (size_t step_id = 0; step_id < seq_len_; step_id++) { for (size_t step_id = 0; step_id < seq_len_; step_id++) {
// create output alias variables
if (step_id > 0) { if (step_id > 0) {
rnn::LinkMemories( rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1,
step_scopes, arg_->memories, step_id, -1, false /*infer_shape_mode*/); false /*infer_shape_mode*/);
} }
net->GetMutable<NetOp>()->Run(*step_scopes[step_id], dev_ctx); net->GetMutable<NetOp>()->Run(*step_scopes[step_id], dev_ctx);
} }
rnn::ConcatOutputs( rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
step_scopes, arg_->outlinks, seq_len_, false /*infer_shape_mode*/); false /*infer_shape_mode*/);
} }
void RecurrentAlgorithm::CreateScopes(const Scope& scope) const { void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
// TODO(xxx) Only two scopes are needed for inference, this case will be // TODO(superjom) Only two scopes are needed for inference, this case will be
// supported later. // supported later.
auto step_scopes = auto step_scopes_var = scope.FindVar(arg_->step_scopes);
scope.FindVar(arg_->step_scopes)->GetMutable<std::vector<Scope*>>(); PADDLE_ENFORCE(step_scopes_var != nullptr, "");
auto step_scopes = step_scopes_var->GetMutable<std::vector<Scope*>>();
// Now all variables in scope must be created outside of op.
auto net_var = scope.FindVar(arg_->step_net);
PADDLE_ENFORCE(net_var != nullptr, "no stepnet called %s in scope",
arg_->step_net);
auto net_op = net_var->GetMutable<NetOp>();
PADDLE_ENFORCE(!net_op->outputs_.empty(), "net_op has no outputs");
if (seq_len_ > step_scopes->size()) { if (seq_len_ > step_scopes->size()) {
for (size_t i = step_scopes->size(); i < seq_len_; ++i) { for (size_t i = step_scopes->size(); i < seq_len_; ++i) {
auto& step_scope = scope.NewScope(); auto& step_scope = scope.NewScope();
// Now all variables in scope must be created outside of op. // create step net's temp inputs
auto net_op = scope.FindVar(arg_->step_net)->GetMutable<NetOp>();
for (auto& input : net_op->inputs_) { for (auto& input : net_op->inputs_) {
        // the weights are located in the parent scope        // the weights are located in the parent scope
if (!step_scope.FindVar(input)) step_scope.NewVar(input); if (!step_scope.FindVar(input))
step_scope.NewVar(input)->GetMutable<Tensor>();
} }
for (auto& output : net_op->outputs_) { // create stepnet's outputs
for (const auto& output : net_op->outputs_) {
step_scope.NewVar(output); step_scope.NewVar(output);
} }
step_scopes->emplace_back(&step_scope); step_scopes->emplace_back(&step_scope);
...@@ -245,37 +106,27 @@ void RecurrentAlgorithm::InitMemories(Scope* step_scope, ...@@ -245,37 +106,27 @@ void RecurrentAlgorithm::InitMemories(Scope* step_scope,
for (auto& attr : arg_->memories) { for (auto& attr : arg_->memories) {
Tensor* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<Tensor>(); Tensor* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<Tensor>();
PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr, PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
"memory [%s]'s boot variable [%s] not exists", "memory [%s]'s boot variable [%s] not exists", attr.var,
attr.var,
attr.boot_var); attr.boot_var);
Tensor* boot_mem = step_scope->FindVar(attr.boot_var)->GetMutable<Tensor>(); Tensor* boot_mem = step_scope->FindVar(attr.boot_var)->GetMutable<Tensor>();
if (infer_shape_mode) { if (infer_shape_mode) {
pre_mem->Resize(boot_mem->dims()); pre_mem->Resize(boot_mem->dims());
PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2);
} else { } else {
pre_mem->ShareDataWith<float>(*boot_mem); pre_mem->ShareDataWith<float>(*boot_mem);
} }
} }
} }
const rnn::ArgumentName RecurrentOp::kArgName{"step_net", const rnn::ArgumentName RecurrentOp::kArgName{
"step_scopes", "step_net", "step_scopes", "inlinks",
"inlinks", "outlinks", "inlink_alias", "outlink_alias",
"outlinks", "memories", "pre_memories", "boot_memories"};
"inlink_alias",
"outlink_alias",
"memories",
"pre_memories",
"boot_memories"};
const rnn::ArgumentName RecurrentGradientOp::kArgName{"step_net", const rnn::ArgumentName RecurrentGradientOp::kArgName{
"step_scopes", "step_net", "step_scopes", "outlink@grad",
"outlink@grad", "inlink@grad", "inlink_alias", "outlink_alias",
"inlink@grad", "memories", "pre_memories", "boot_memories@grad"};
"inlink_alias",
"outlink_alias",
"memories",
"pre_memories",
"boot_memories@grad"};
void RecurrentOp::Init() { void RecurrentOp::Init() {
OperatorBase::Init(); OperatorBase::Init();
...@@ -285,7 +136,7 @@ void RecurrentOp::Init() { ...@@ -285,7 +136,7 @@ void RecurrentOp::Init() {
} }
class RecurrentAlgorithmProtoAndCheckerMaker : public OpProtoAndCheckerMaker { class RecurrentAlgorithmProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
public: public:
RecurrentAlgorithmProtoAndCheckerMaker(OpProto* proto, RecurrentAlgorithmProtoAndCheckerMaker(OpProto* proto,
OpAttrChecker* op_checker) OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
...@@ -316,31 +167,29 @@ public: ...@@ -316,31 +167,29 @@ public:
void RecurrentGradientAlgorithm::Run( void RecurrentGradientAlgorithm::Run(
const Scope& scope, const platform::DeviceContext& dev_ctx) const { const Scope& scope, const platform::DeviceContext& dev_ctx) const {
auto step_scopes = GetStepScopes(scope); auto step_scopes = GetStepScopes(scope);
rnn::SegmentInputs( rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
step_scopes, arg_->inlinks, seq_len_, false /*infer_shape_mode*/); false /*infer_shape_mode*/);
Variable* net = scope.FindVar(arg_->step_net); Variable* net = scope.FindVar(arg_->step_net);
PADDLE_ENFORCE(net != nullptr, "failed to get step net"); PADDLE_ENFORCE(net != nullptr, "failed to get step net");
for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
if (static_cast<size_t>(step_id) != seq_len_ - 1) { if (static_cast<size_t>(step_id) != seq_len_ - 1) {
rnn::LinkMemories( rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1,
step_scopes, arg_->memories, step_id, 1, false /*infer_shape_mode*/); false /*infer_shape_mode*/);
} }
net->GetMutable<NetOp>()->Run(*step_scopes[step_id], dev_ctx); net->GetMutable<NetOp>()->Run(*step_scopes[step_id], dev_ctx);
} }
LinkBootMemoryGradients(step_scopes[0], false); LinkBootMemoryGradients(step_scopes[0], false);
rnn::ConcatOutputs( rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
step_scopes, arg_->outlinks, seq_len_, false /*infer_shape_mode*/); false /*infer_shape_mode*/);
} }
void RecurrentGradientAlgorithm::LinkBootMemoryGradients( void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
Scope* step_scope, bool infer_shape_mode) const { Scope* step_scope, bool infer_shape_mode) const {
for (auto& attr : arg_->memories) { for (auto& attr : arg_->memories) {
PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr, PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr,
"memory variable [%s] does not exists", "memory variable [%s] does not exists", attr.var);
attr.var);
PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr, PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
"boot variable [%s] does not exists", "boot variable [%s] does not exists", attr.boot_var);
attr.boot_var);
Tensor* mem_grad = step_scope->NewVar(attr.var)->GetMutable<Tensor>(); Tensor* mem_grad = step_scope->NewVar(attr.var)->GetMutable<Tensor>();
Tensor* boot_mem_grad = Tensor* boot_mem_grad =
step_scope->NewVar(attr.boot_var)->GetMutable<Tensor>(); step_scope->NewVar(attr.boot_var)->GetMutable<Tensor>();
...@@ -357,19 +206,19 @@ void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const { ...@@ -357,19 +206,19 @@ void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const {
->GetMutable<Tensor>() ->GetMutable<Tensor>()
->dims()[0]; ->dims()[0];
auto step_scopes = GetStepScopes(scope); auto step_scopes = GetStepScopes(scope);
rnn::SegmentInputs( rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
step_scopes, arg_->inlinks, seq_len_, true /*infer_shape_mode*/); true /*infer_shape_mode*/);
Variable* net = scope.FindVar(arg_->step_net); Variable* net = scope.FindVar(arg_->step_net);
PADDLE_ENFORCE(net != nullptr, "failed to get step net"); PADDLE_ENFORCE(net != nullptr, "failed to get step net");
for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
if (static_cast<size_t>(step_id) != seq_len_ - 1) { if (static_cast<size_t>(step_id) != seq_len_ - 1) {
rnn::LinkMemories( rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1,
step_scopes, arg_->memories, step_id, 1, true /*infer_shape_mode*/); true /*infer_shape_mode*/);
} }
net->GetMutable<NetOp>()->InferShape(*step_scopes[step_id]); net->GetMutable<NetOp>()->InferShape(*step_scopes[step_id]);
} }
rnn::ConcatOutputs( rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
step_scopes, arg_->outlinks, seq_len_, true /*infer_shape_mode*/); true /*infer_shape_mode*/);
LinkBootMemoryGradients(step_scopes[0], true /*infer_shape_mode*/); LinkBootMemoryGradients(step_scopes[0], true /*infer_shape_mode*/);
} }
...@@ -383,6 +232,5 @@ void RecurrentGradientOp::Init() { ...@@ -383,6 +232,5 @@ void RecurrentGradientOp::Init() {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP(recurrent_op, REGISTER_OP(recurrent_op, paddle::operators::RecurrentOp,
paddle::operators::RecurrentOp,
paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker); paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker);
...@@ -15,84 +15,11 @@ ...@@ -15,84 +15,11 @@
#pragma once #pragma once
#include "paddle/framework/operator.h" #include "paddle/framework/operator.h"
#include "paddle/operators/rnn/recurrent_op_utils.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using namespace paddle::framework; // NOLINT
namespace rnn {
/**
 * Memory of an RNN (same as the role of `Memory` in PaddlePaddle).
 *
 * Memory attributes cached by this op, dims will be inferred from
* boot memories in father scope. Other attributes are copied from Op's proto
* attributes.
*/
struct MemoryAttr {
// name of current state variable
std::string var;
// name of previous step's state variable
std::string pre_var;
// name of the variables to init this memory (same role of `boot_layer` in
  // PaddlePaddle), which is stored in the father's scope.
std::string boot_var;
};
struct Link {
// input or output links name.
std::string internal;
// alias to avoid duplicate keys in scopes.
std::string external;
};
struct Argument {
std::string step_net;
std::string step_scopes;
std::vector<Link> inlinks;
std::vector<Link> outlinks;
std::vector<rnn::MemoryAttr> memories;
};
struct ArgumentName {
std::string step_net;
std::string step_scopes;
std::string inlinks;
std::string outlinks;
std::string inlink_alias; // the alias of inlinks in step net.
std::string outlink_alias; // the alias of outlinks in step net.
std::string memories; // the memory name
std::string pre_memories; // the previous memory name
std::string boot_memories; // the boot memory name
};
/**
* Prepare inputs for each step net.
*/
void SegmentInputs(const std::vector<Scope*>& step_scopes,
const std::vector<Link>& inlinks,
const size_t seq_len,
bool infer_shape_mode);
/**
* Process outputs of step nets and merge to variables.
*/
void ConcatOutputs(const std::vector<Scope*>& step_scopes,
const std::vector<Link>& outlinks,
const size_t seq_len,
bool infer_shape_mode);
void LinkMemories(const std::vector<Scope*>& step_scopes,
const std::vector<MemoryAttr>& memories,
const size_t step_id,
const int offset,
bool infer_shape_mode);
void InitArgument(const ArgumentName& name, Argument* arg);
}; // namespace rnn
// The sequence format in RecurrentOp is Tensor<seq_len, batch_size, dim> now. // The sequence format in RecurrentOp is Tensor<seq_len, batch_size, dim> now.
// TODO(Yan Chunwei): // TODO(Yan Chunwei):
// 1. No-padding computing for sequences with indefinite length in one batch. // 1. No-padding computing for sequences with indefinite length in one batch.
...@@ -102,32 +29,35 @@ void InitArgument(const ArgumentName& name, Argument* arg); ...@@ -102,32 +29,35 @@ void InitArgument(const ArgumentName& name, Argument* arg);
// Refer to: https://arxiv.org/pdf/1502.02367.pdf // Refer to: https://arxiv.org/pdf/1502.02367.pdf
class RecurrentAlgorithm { class RecurrentAlgorithm {
public: public:
void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const; void Run(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const;
void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); } void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); }
/** /**
* InferShape must be called before Run. * InferShape must be called before Run.
*/ */
void InferShape(const Scope& scope) const; void InferShape(const framework::Scope& scope) const;
protected: protected:
/* /*
* The step scopes will be stored in the father scope as a variable. * The step scopes will be stored in the father scope as a variable.
* *
   * NOTE the scopes are reused in both the forward and backward passes, so they   * NOTE the scopes are reused in both the forward and backward passes, so they
   * are created once and expanded if more steps are needed.   * are created once and expanded if more steps are needed.
*/ */
void CreateScopes(const Scope& scope) const; void CreateScopes(const framework::Scope& scope) const;
const std::vector<Scope*>& GetStepScopes(const Scope& scope) const { const std::vector<framework::Scope*>& GetStepScopes(
return *scope.FindVar(arg_->step_scopes)->GetMutable<std::vector<Scope*>>(); const framework::Scope& scope) const {
return *scope.FindVar(arg_->step_scopes)
->GetMutable<std::vector<framework::Scope*>>();
} }
void InitMemories(Scope* step_scopes, bool infer_shape_mode) const; void InitMemories(framework::Scope* step_scopes, bool infer_shape_mode) const;
private: private:
std::unique_ptr<rnn::Argument> arg_; std::unique_ptr<rnn::Argument> arg_;
mutable size_t seq_len_; mutable size_t seq_len_;
}; };
...@@ -143,65 +73,73 @@ class RecurrentGradientAlgorithm { ...@@ -143,65 +73,73 @@ class RecurrentGradientAlgorithm {
   * lot, and the latter is a wrapper that acts like an adapter for it to make   * lot, and the latter is a wrapper that acts like an adapter for it to make
   * RNN an operator.   * RNN an operator.
*/ */
public: public:
void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); } void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); }
void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const; void Run(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const;
void LinkBootMemoryGradients(Scope* step_scopes, bool infer_shape_mode) const; void LinkBootMemoryGradients(framework::Scope* step_scopes,
bool infer_shape_mode) const;
/** /**
* InferShape must be called before Run. * InferShape must be called before Run.
*/ */
void InferShape(const Scope& scope) const; void InferShape(const framework::Scope& scope) const;
protected: protected:
inline const std::vector<Scope*>& GetStepScopes(const Scope& scope) const { inline const std::vector<framework::Scope*>& GetStepScopes(
return *scope.FindVar(arg_->step_scopes)->GetMutable<std::vector<Scope*>>(); const framework::Scope& scope) const {
return *scope.FindVar(arg_->step_scopes)
->GetMutable<std::vector<framework::Scope*>>();
} }
private: private:
std::unique_ptr<rnn::Argument> arg_; std::unique_ptr<rnn::Argument> arg_;
mutable size_t seq_len_; mutable size_t seq_len_;
}; };
class RecurrentOp final : public OperatorBase { class RecurrentOp final : public framework::OperatorBase {
public: public:
void Init() override; void Init() override;
/** /**
* InferShape must be called before Run. * InferShape must be called before Run.
*/ */
void InferShape(const Scope& scope) const override { alg_.InferShape(scope); } void InferShape(const framework::Scope& scope) const override {
alg_.InferShape(scope);
}
void Run(const Scope& scope, void Run(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const override { const platform::DeviceContext& dev_ctx) const override {
alg_.Run(scope, dev_ctx); alg_.Run(scope, dev_ctx);
} }
static const rnn::ArgumentName kArgName; static const rnn::ArgumentName kArgName;
private: private:
RecurrentAlgorithm alg_; RecurrentAlgorithm alg_;
}; };
class RecurrentGradientOp final : public OperatorBase { class RecurrentGradientOp final : public framework::OperatorBase {
public: public:
void Init() override; void Init() override;
/** /**
* InferShape must be called before Run. * InferShape must be called before Run.
*/ */
void InferShape(const Scope& scope) const override { alg_.InferShape(scope); } void InferShape(const framework::Scope& scope) const override {
alg_.InferShape(scope);
}
void Run(const Scope& scope, void Run(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const override { const platform::DeviceContext& dev_ctx) const override {
alg_.Run(scope, dev_ctx); alg_.Run(scope, dev_ctx);
} }
static const rnn::ArgumentName kArgName; static const rnn::ArgumentName kArgName;
private: private:
RecurrentGradientAlgorithm alg_; RecurrentGradientAlgorithm alg_;
}; };
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/framework/ddim.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h" #include "paddle/framework/operator.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
...@@ -24,8 +25,11 @@ ...@@ -24,8 +25,11 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using framework::make_ddim;
using framework::DDim;
class RecurrentOpTest : public ::testing::Test { class RecurrentOpTest : public ::testing::Test {
protected: protected:
virtual void SetUp() override { virtual void SetUp() override {
CreateGlobalVariables(); CreateGlobalVariables();
CreateStepNet(); CreateStepNet();
...@@ -72,7 +76,7 @@ protected: ...@@ -72,7 +76,7 @@ protected:
} }
void CreateRNNOp() { void CreateRNNOp() {
OpDesc op_desc; framework::OpDesc op_desc;
op_desc.set_type("recurrent_op"); op_desc.set_type("recurrent_op");
// inlinks 0 // inlinks 0
...@@ -170,7 +174,7 @@ TEST_F(RecurrentOpTest, Run) { ...@@ -170,7 +174,7 @@ TEST_F(RecurrentOpTest, Run) {
} }
class RecurrentGradientAlgorithmTest : public ::testing::Test { class RecurrentGradientAlgorithmTest : public ::testing::Test {
protected: protected:
virtual void SetUp() override { virtual void SetUp() override {
CreateGlobalVariables(); CreateGlobalVariables();
CreateStepScopes(); CreateStepScopes();
...@@ -273,13 +277,11 @@ protected: ...@@ -273,13 +277,11 @@ protected:
LOG(INFO) << "create variable step_net"; LOG(INFO) << "create variable step_net";
Variable* var = scope_.NewVar("step_net"); Variable* var = scope_.NewVar("step_net");
auto net = var->GetMutable<NetOp>(); auto net = var->GetMutable<NetOp>();
net->AddOp(OpRegistry::CreateOp("mul", net->AddOp(OpRegistry::CreateOp("mul", {"rnn/h_pre", "rnn/w", "rnn/s_grad"},
{"rnn/h_pre", "rnn/w", "rnn/s_grad"}, {"rnn/h_pre_grad", "rnn/w_grad"}, {}));
{"rnn/h_pre_grad", "rnn/w_grad"},
{}));
net->AddOp(OpRegistry::CreateOp( net->AddOp(OpRegistry::CreateOp("add_two", {"rnn/h_grad"},
"add_two", {"rnn/h_grad"}, {"rnn/x_grad", "rnn/s_grad"}, {})); {"rnn/x_grad", "rnn/s_grad"}, {}));
net->CompleteAddOp(); net->CompleteAddOp();
} }
...@@ -293,9 +295,7 @@ protected: ...@@ -293,9 +295,7 @@ protected:
inlink.internal = "rnn/x"; inlink.internal = "rnn/x";
auto step_scopes = auto step_scopes =
scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>(); scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>();
rnn::SegmentInputs(*step_scopes, rnn::SegmentInputs(*step_scopes, std::vector<rnn::Link>{inlink}, 10,
std::vector<rnn::Link>{inlink},
10,
true /*infer_shape_mode*/); true /*infer_shape_mode*/);
} }
...@@ -310,8 +310,8 @@ protected: ...@@ -310,8 +310,8 @@ protected:
auto step_scopes = auto step_scopes =
scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>(); scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>();
for (int i = 1; i < 10; ++i) { for (int i = 1; i < 10; ++i) {
rnn::LinkMemories( rnn::LinkMemories(*step_scopes, memories, i, -1,
*step_scopes, memories, i, -1, true /*infer_shape_mode*/); true /*infer_shape_mode*/);
} }
} }
...@@ -391,3 +391,4 @@ TEST(RecurrentOp, LinkMemories) { ...@@ -391,3 +391,4 @@ TEST(RecurrentOp, LinkMemories) {
USE_OP(add_two); USE_OP(add_two);
USE_OP(mul); USE_OP(mul);
USE_OP_WITHOUT_KERNEL(recurrent_op);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/rnn/recurrent_op_utils.h"
namespace paddle {
namespace operators {
namespace rnn {
namespace fmw = paddle::framework;
void SegmentInputs(const std::vector<Scope*>& step_scopes,
const std::vector<Link>& inlinks, const size_t seq_len,
bool infer_shape_mode) {
PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided.");
for (size_t i = 0; i < inlinks.size(); ++i) {
auto input_var = step_scopes[0]->FindVar(inlinks[i].external);
PADDLE_ENFORCE(input_var != nullptr, "input link [%s] is not in scope.",
inlinks[i].external);
Tensor* input = input_var->GetMutable<Tensor>();
fmw::DDim dims = input->dims();
PADDLE_ENFORCE(static_cast<size_t>(dims[0]) == seq_len,
"all the inlinks must have same length");
fmw::DDim step_dims = slice_ddim(dims, 1, dims.size());
for (size_t j = 0; j < seq_len; j++) {
Tensor* step_input =
step_scopes[j]->NewVar(inlinks[i].internal)->GetMutable<Tensor>();
if (!infer_shape_mode) {
*step_input = input->Slice<float>(j, j + 1);
}
step_input->Resize(step_dims);
}
}
}
void ConcatOutputs(const std::vector<Scope*>& step_scopes,
const std::vector<Link>& outlinks, const size_t seq_len,
bool infer_shape_mode) {
for (size_t i = 0; i < outlinks.size(); i++) {
auto output_var = step_scopes[0]->FindVar(outlinks[i].external);
PADDLE_ENFORCE(output_var != nullptr, "output link [%s] is not in scope.",
outlinks[i].external);
Tensor* output = output_var->GetMutable<Tensor>();
if (infer_shape_mode) {
auto step_scope_var = step_scopes[0]->FindVar(outlinks[i].internal);
PADDLE_ENFORCE(step_scope_var != nullptr, "%s not in scope",
outlinks[i].internal);
fmw::DDim step_dims =
step_scope_var->template GetMutable<Tensor>()->dims();
std::vector<int> dims_vec = vectorize(step_dims);
dims_vec.insert(dims_vec.begin(), seq_len);
output->Resize(fmw::make_ddim(dims_vec));
} else {
output->mutable_data<float>(platform::CPUPlace());
for (size_t j = 0; j < seq_len; j++) {
Tensor* step_output =
step_scopes[j]->FindVar(outlinks[i].internal)->GetMutable<Tensor>();
        // TODO(luotao02) data type and platform::DeviceContext() should be set
// correctly
(output->Slice<float>(j, j + 1))
.CopyFrom<float>(*step_output, platform::CPUPlace());
}
}
}
}
void LinkMemories(const std::vector<Scope*>& scopes,
const std::vector<rnn::MemoryAttr>& memories,
const size_t step_id, const int offset,
bool infer_shape_mode) {
PADDLE_ENFORCE_LT(step_id, scopes.size(),
"step [%d] is out of range of step scopes' size [%d]",
step_id, scopes.size());
PADDLE_ENFORCE_GE(static_cast<int>(step_id) + offset, 0,
"offset [%d] must be large than -[%d]", offset, step_id);
PADDLE_ENFORCE_LT(
step_id + offset, scopes.size(),
"offset [%d] is out of range, it must be less than (%d - %d)", offset,
scopes.size(), step_id);
auto scope = scopes[step_id];
auto linked_scope = scopes[step_id + offset];
for (auto& attr : memories) {
auto mem = scope->FindVar(attr.pre_var)->GetMutable<Tensor>();
auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable<Tensor>();
if (infer_shape_mode) {
mem->Resize(linked_mem->dims());
} else {
mem->ShareDataWith<float>(*linked_mem);
}
}
}
void InitArgument(const ArgumentName& name, Argument* arg,
const OperatorBase& op) {
arg->step_net = op.Input(name.step_net);
arg->step_scopes = op.Output(name.step_scopes);
auto inlinks = op.Inputs(name.inlinks);
auto inlink_alias = op.GetAttr<std::vector<std::string>>(name.inlink_alias);
PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(),
"the size of inlinks and inlink_alias don't match:%d,%d",
inlinks.size(), inlink_alias.size());
for (size_t i = 0; i < inlinks.size(); ++i) {
rnn::Link link;
link.external = inlinks[i];
link.internal = inlink_alias[i];
(arg->inlinks).push_back(link);
}
auto outlinks = op.Outputs(name.outlinks);
auto outlink_alias = op.GetAttr<std::vector<std::string>>(name.outlink_alias);
PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(),
"the size of outlinks and outlink_alias don't match:%d,%d",
outlinks.size(), outlink_alias.size());
for (size_t i = 0; i < outlinks.size(); ++i) {
rnn::Link link;
link.external = outlinks[i];
link.internal = outlink_alias[i];
(arg->outlinks).push_back(link);
}
auto boot_memories = op.Inputs(name.boot_memories);
// attributes
auto memories = op.GetAttr<std::vector<std::string>>(name.memories);
auto pre_memories = op.GetAttr<std::vector<std::string>>(name.pre_memories);
PADDLE_ENFORCE(memories.size() == boot_memories.size(),
"the size of memories, boot_memories don't match:%d,%d",
memories.size(), boot_memories.size());
PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(),
"the size of pre_memories, boot_memories don't match:%d,%d",
pre_memories.size(), boot_memories.size());
  PADDLE_ENFORCE(memories.size() > 0, "at least one memory should be set");
for (size_t i = 0; i < memories.size(); ++i) {
rnn::MemoryAttr mem_attr;
mem_attr.var = memories[i];
mem_attr.pre_var = pre_memories[i];
mem_attr.boot_var = boot_memories[i];
(arg->memories).push_back(mem_attr);
}
}
} // namespace rnn
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/framework/operator.h"
#include "paddle/operators/type_alias.h"
namespace paddle {
namespace operators {
namespace rnn {
/**
 * Memory of an RNN (same as the role of `Memory` in PaddlePaddle).
 *
 * Memory attributes cached by this op, dims will be inferred from
* boot memories in father scope. Other attributes are copied from Op's proto
* attributes.
*/
struct MemoryAttr {
// name of current state variable
std::string var;
// name of previous step's state variable
std::string pre_var;
// name of the variables to init this memory (same role of `boot_layer` in
  // PaddlePaddle), which is stored in the father's scope.
std::string boot_var;
};
struct Link {
// input or output links name.
std::string internal;
// alias to avoid duplicate keys in scopes.
std::string external;
};
struct Argument {
std::string step_net;
std::string step_scopes;
std::vector<Link> inlinks;
std::vector<Link> outlinks;
std::vector<rnn::MemoryAttr> memories;
};
struct ArgumentName {
std::string step_net;
std::string step_scopes;
std::string inlinks;
std::string outlinks;
std::string inlink_alias; // the alias of inlinks in step net.
std::string outlink_alias; // the alias of outlinks in step net.
std::string memories; // the memory name
std::string pre_memories; // the previous memory name
std::string boot_memories; // the boot memory name
};
/**
* Prepare inputs for each step net.
*/
void SegmentInputs(const std::vector<Scope*>& step_scopes,
const std::vector<Link>& inlinks, const size_t seq_len,
bool infer_shape_mode);
/**
* Process outputs of step nets and merge to variables.
*/
void ConcatOutputs(const std::vector<Scope*>& step_scopes,
const std::vector<Link>& outlinks, const size_t seq_len,
bool infer_shape_mode);
void LinkMemories(const std::vector<Scope*>& step_scopes,
const std::vector<MemoryAttr>& memories, const size_t step_id,
const int offset, bool infer_shape_mode);
void InitArgument(const ArgumentName& name, Argument* arg,
const OperatorBase& op);
} // namespace rnn
} // namespace operators
} // namespace paddle
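Taken together, the helpers declared above shuttle data between the parent scope and the per-step scopes: SegmentInputs slices a sequence tensor of shape [seq_len, batch, dim] into seq_len step tensors of shape [batch, dim] (only resizing them in infer_shape_mode), ConcatOutputs performs the inverse concatenation, and LinkMemories points step t's pre-state at the state of step t + offset. The toy sketch below shows only the slice/concat index arithmetic on a flat buffer; it deliberately avoids Paddle's Tensor and Scope types, and every name in it is illustrative.

#include <algorithm>
#include <cstddef>
#include <vector>

// Slice step j out of a [seq_len, batch, dim] buffer -> a [batch, dim] buffer.
std::vector<float> segment_step(const std::vector<float>& seq, size_t j,
                                size_t batch, size_t dim) {
  return std::vector<float>(seq.begin() + j * batch * dim,
                            seq.begin() + (j + 1) * batch * dim);
}

// Write a [batch, dim] step result back into slot j of the [seq_len, batch, dim] output.
void concat_step(std::vector<float>* out, const std::vector<float>& step,
                 size_t j, size_t batch, size_t dim) {
  std::copy(step.begin(), step.end(), out->begin() + j * batch * dim);
}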
...@@ -17,7 +17,7 @@ namespace paddle { ...@@ -17,7 +17,7 @@ namespace paddle {
namespace operators { namespace operators {
class RowWiseAddOp : public OperatorWithKernel { class RowWiseAddOp : public OperatorWithKernel {
protected: protected:
void InferShape(const InferShapeContext &ctx) const override { void InferShape(const InferShapeContext &ctx) const override {
PADDLE_ENFORCE(ctx.InputSize() == 2UL, PADDLE_ENFORCE(ctx.InputSize() == 2UL,
"Two inputs is needed by rowwise add"); "Two inputs is needed by rowwise add");
...@@ -33,7 +33,7 @@ protected: ...@@ -33,7 +33,7 @@ protected:
}; };
class RowWiseAddOpMaker : public OpProtoAndCheckerMaker { class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
public: public:
RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker) RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The left input of row-wise add op, must be matrix"); AddInput("X", "The left input of row-wise add op, must be matrix");
......
...@@ -20,7 +20,7 @@ namespace operators { ...@@ -20,7 +20,7 @@ namespace operators {
template <typename Place, typename T> template <typename Place, typename T>
class RowWiseAddKernel : public OpKernel { class RowWiseAddKernel : public OpKernel {
public: public:
void Compute(const ExecutionContext& context) const override { void Compute(const ExecutionContext& context) const override {
auto out = context.Output<Tensor>(0); auto out = context.Output<Tensor>(0);
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
......
...@@ -18,7 +18,7 @@ namespace paddle { ...@@ -18,7 +18,7 @@ namespace paddle {
namespace operators { namespace operators {
class SGDOp : public OperatorWithKernel { class SGDOp : public OperatorWithKernel {
protected: protected:
void InferShape(const InferShapeContext &ctx) const override { void InferShape(const InferShapeContext &ctx) const override {
PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of SGDOp must be two"); PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of SGDOp must be two");
PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of SGDOp must be one"); PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of SGDOp must be one");
...@@ -32,7 +32,7 @@ protected: ...@@ -32,7 +32,7 @@ protected:
}; };
class SGDOpMaker : public OpProtoAndCheckerMaker { class SGDOpMaker : public OpProtoAndCheckerMaker {
public: public:
SGDOpMaker(OpProto *proto, OpAttrChecker *op_checker) SGDOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("param", "input parameter"); AddInput("param", "input parameter");
......
...@@ -20,7 +20,7 @@ namespace operators { ...@@ -20,7 +20,7 @@ namespace operators {
template <typename Place, typename T> template <typename Place, typename T>
class SGDOpKernel : public OpKernel { class SGDOpKernel : public OpKernel {
public: public:
void Compute(const ExecutionContext& ctx) const override { void Compute(const ExecutionContext& ctx) const override {
auto param = ctx.Input<Tensor>("param"); auto param = ctx.Input<Tensor>("param");
auto grad = ctx.Input<Tensor>("grad"); auto grad = ctx.Input<Tensor>("grad");
......
...@@ -17,7 +17,7 @@ namespace paddle { ...@@ -17,7 +17,7 @@ namespace paddle {
namespace operators { namespace operators {
class SigmoidOp : public OperatorWithKernel { class SigmoidOp : public OperatorWithKernel {
protected: protected:
void InferShape(const InferShapeContext &ctx) const override { void InferShape(const InferShapeContext &ctx) const override {
PADDLE_ENFORCE(ctx.InputSize() == 1, "Sigmoid Op only have one input"); PADDLE_ENFORCE(ctx.InputSize() == 1, "Sigmoid Op only have one input");
PADDLE_ENFORCE(ctx.OutputSize() == 1, "Sigmoid Op only have one output"); PADDLE_ENFORCE(ctx.OutputSize() == 1, "Sigmoid Op only have one output");
...@@ -26,7 +26,7 @@ protected: ...@@ -26,7 +26,7 @@ protected:
}; };
class SigmoidOpMaker : public OpProtoAndCheckerMaker { class SigmoidOpMaker : public OpProtoAndCheckerMaker {
public: public:
SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "sigmoid input"); AddInput("X", "sigmoid input");
...@@ -36,11 +36,9 @@ public: ...@@ -36,11 +36,9 @@ public:
}; };
class SigmoidOpGrad : public OperatorWithKernel { class SigmoidOpGrad : public OperatorWithKernel {
protected: protected:
void InferShape(const InferShapeContext &ctx) const override {} void InferShape(const InferShapeContext &ctx) const override {
std::string DebugString() const override { ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
LOG(INFO) << "SigmoidGrad";
return "";
} }
}; };
...@@ -51,3 +49,5 @@ REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker); ...@@ -51,3 +49,5 @@ REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker);
REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, ops::SigmoidOpGrad); REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, ops::SigmoidOpGrad);
REGISTER_OP_CPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::CPUPlace, float>); REGISTER_OP_CPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(sigmoid_grad,
ops::SigmoidGradKernel<ops::CPUPlace, float>);
...@@ -16,3 +16,5 @@ ...@@ -16,3 +16,5 @@
#include "paddle/operators/sigmoid_op.h" #include "paddle/operators/sigmoid_op.h"
REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::GPUPlace, float>); REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(sigmoid_grad,
ops::SigmoidGradKernel<ops::GPUPlace, float>);
...@@ -21,12 +21,13 @@ namespace operators { ...@@ -21,12 +21,13 @@ namespace operators {
template <typename Place, typename T> template <typename Place, typename T>
class SigmoidKernel : public OpKernel { class SigmoidKernel : public OpKernel {
public: public:
void Compute(const ExecutionContext& context) const override { void Compute(const ExecutionContext& context) const override {
auto input = context.Input<Tensor>(0); auto input = context.Input<Tensor>(0);
auto output = context.Output<Tensor>(0); auto output = context.Output<Tensor>(0);
output->mutable_data<T>(context.GetPlace()); output->mutable_data<T>(context.GetPlace());
    // The clipping is used in Paddle's raw implementation
auto X = EigenVector<T>::Flatten(*input); auto X = EigenVector<T>::Flatten(*input);
auto Y = EigenVector<T>::Flatten(*output); auto Y = EigenVector<T>::Flatten(*output);
auto place = context.GetEigenDevice<Place>(); auto place = context.GetEigenDevice<Place>();
...@@ -34,5 +35,23 @@ public: ...@@ -34,5 +35,23 @@ public:
Y.device(place) = 1.0 / (1.0 + (-1.0 * X).exp()); Y.device(place) = 1.0 / (1.0 + (-1.0 * X).exp());
} }
}; };
template <typename Place, typename T>
class SigmoidGradKernel : public OpKernel {
public:
void Compute(const ExecutionContext& context) const override {
auto Y_t = context.Input<Tensor>("Y");
auto dY_t = context.Input<Tensor>(framework::GradVarName("Y"));
auto dX_t = context.Output<Tensor>(framework::GradVarName("X"));
dX_t->mutable_data<T>(context.GetPlace());
auto dX = EigenVector<T>::Flatten(*dX_t);
auto Y = EigenVector<T>::Flatten(*Y_t);
auto dY = EigenVector<T>::Flatten(*dY_t);
dX.device(context.GetEigenDevice<Place>()) = dY * Y * (1. - Y);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
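The backward kernel added above relies on the standard identity for the logistic function: with Y = 1 / (1 + exp(-X)), dY/dX = exp(-X) / (1 + exp(-X))^2 = Y * (1 - Y), so the input gradient can be computed from the saved output alone as dX = dY * Y * (1 - Y), which is exactly the elementwise expression in SigmoidGradKernel.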
...@@ -18,7 +18,7 @@ namespace paddle { ...@@ -18,7 +18,7 @@ namespace paddle {
namespace operators { namespace operators {
class SoftmaxOp : public OperatorWithKernel { class SoftmaxOp : public OperatorWithKernel {
protected: protected:
void InferShape(const InferShapeContext &ctx) const override { void InferShape(const InferShapeContext &ctx) const override {
PADDLE_ENFORCE(ctx.InputSize() == 1UL, PADDLE_ENFORCE(ctx.InputSize() == 1UL,
"Only one input is need for softmax"); "Only one input is need for softmax");
...@@ -31,7 +31,7 @@ protected: ...@@ -31,7 +31,7 @@ protected:
}; };
class SoftmaxOpMaker : public OpProtoAndCheckerMaker { class SoftmaxOpMaker : public OpProtoAndCheckerMaker {
public: public:
SoftmaxOpMaker(OpProto *proto, OpAttrChecker *op_checker) SoftmaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "input of softmax"); AddInput("X", "input of softmax");
...@@ -41,19 +41,19 @@ public: ...@@ -41,19 +41,19 @@ public:
}; };
class SoftmaxOpGrad : public OperatorWithKernel { class SoftmaxOpGrad : public OperatorWithKernel {
protected: protected:
void InferShape(const InferShapeContext &ctx) const override { void InferShape(const InferShapeContext &ctx) const override {
PADDLE_ENFORCE(ctx.InputSize() == 3UL, PADDLE_ENFORCE(ctx.InputSize() == 3UL,
"Input of SoftmaxOpGrad should be 3, X, Y, YG"); "Input of SoftmaxOpGrad should be 3, X, Y, YG");
PADDLE_ENFORCE(ctx.OutputSize() == 1UL, PADDLE_ENFORCE(ctx.OutputSize() == 1UL,
"Output of SoftmaxOpGrad should be 1"); "Output of SoftmaxOpGrad should be 1");
PADDLE_ENFORCE(ctx.InputVar("Y") != nullptr, "Input(Y) should not be null"); PADDLE_ENFORCE(ctx.InputVar("Y") != nullptr, "Input(Y) should not be null");
PADDLE_ENFORCE(ctx.InputVar(GRAD_VAR_NAME("Y")) != nullptr, PADDLE_ENFORCE(ctx.InputVar(framework::GradVarName("Y")) != nullptr,
"Input(Y@GRAD) should not be null"); "Input(Y@GRAD) should not be null");
PADDLE_ENFORCE(ctx.Input<Tensor>("Y")->dims() == PADDLE_ENFORCE(ctx.Input<Tensor>("Y")->dims() ==
ctx.Input<Tensor>(GRAD_VAR_NAME("Y"))->dims(), ctx.Input<Tensor>(framework::GradVarName("Y"))->dims(),
"the shape of Input(0) and Input(1) should be the same"); "the shape of Input(0) and Input(1) should be the same");
ctx.Output<Tensor>(GRAD_VAR_NAME("X")) ctx.Output<Tensor>(framework::GradVarName("X"))
->Resize(ctx.Input<Tensor>("Y")->dims()); ->Resize(ctx.Input<Tensor>("Y")->dims());
} }
}; };
......
...@@ -24,7 +24,7 @@ namespace operators { ...@@ -24,7 +24,7 @@ namespace operators {
template <typename Place, typename T> template <typename Place, typename T>
class SoftmaxKernel : public OpKernel { class SoftmaxKernel : public OpKernel {
public: public:
void Compute(const ExecutionContext& context) const override { void Compute(const ExecutionContext& context) const override {
auto input = context.Input<Tensor>("X"); auto input = context.Input<Tensor>("X");
auto output = context.Output<Tensor>("Y"); auto output = context.Output<Tensor>("Y");
...@@ -63,13 +63,13 @@ public: ...@@ -63,13 +63,13 @@ public:
template <typename Place, typename T> template <typename Place, typename T>
class SoftmaxGradKernel : public OpKernel { class SoftmaxGradKernel : public OpKernel {
public: public:
void Compute(const ExecutionContext& context) const override { void Compute(const ExecutionContext& context) const override {
std::shared_ptr<Tensor> scale_ = std::make_shared<Tensor>(); std::shared_ptr<Tensor> scale_ = std::make_shared<Tensor>();
auto Y = context.Input<Tensor>("Y"); auto Y = context.Input<Tensor>("Y");
auto dY = context.Input<Tensor>(OperatorBase::GRAD_VAR_NAME("Y")); auto dY = context.Input<Tensor>(framework::GradVarName("Y"));
auto dX = context.Output<Tensor>(OperatorBase::GRAD_VAR_NAME("X")); auto dX = context.Output<Tensor>(framework::GradVarName("X"));
dX->mutable_data<T>(context.GetPlace()); dX->mutable_data<T>(context.GetPlace());
const int batch_size = Y->dims()[0]; const int batch_size = Y->dims()[0];
......
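
For context, SoftmaxGradKernel implements the standard softmax backward rule. Assuming the conventional definition Y = softmax(X) applied row by row, the gradient it has to produce for each row is

    dX_i = Y_i * (dY_i - sum_j(dY_j * Y_j))

which is presumably why the hunk above reads the batch size and keeps a scale_ tensor for the per-row sums.
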
...@@ -26,21 +26,16 @@ using OperatorBase = framework::OperatorBase; ...@@ -26,21 +26,16 @@ using OperatorBase = framework::OperatorBase;
using InferShapeContext = framework::InferShapeContext; using InferShapeContext = framework::InferShapeContext;
using ExecutionContext = framework::ExecutionContext; using ExecutionContext = framework::ExecutionContext;
using Variable = framework::Variable; using Variable = framework::Variable;
template <typename T, template <typename T, int MajorType = Eigen::RowMajor,
int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>; using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
template <typename T, template <typename T, int MajorType = Eigen::RowMajor,
int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>; using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T, template <typename T, int MajorType = Eigen::RowMajor,
int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>; using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T, template <typename T, size_t D, int MajorType = Eigen::RowMajor,
size_t D,
int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>; using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
......
...@@ -55,7 +55,7 @@ class CPUDeviceContext : public DeviceContext { ...@@ -55,7 +55,7 @@ class CPUDeviceContext : public DeviceContext {
class CUDADeviceContext : public DeviceContext { class CUDADeviceContext : public DeviceContext {
public: public:
CUDADeviceContext(GPUPlace); // NOLINT explicit CUDADeviceContext(GPUPlace);
virtual ~CUDADeviceContext(); virtual ~CUDADeviceContext();
/*! \brief Wait for all operations completion in the stream. */ /*! \brief Wait for all operations completion in the stream. */
......
...@@ -15,24 +15,28 @@ limitations under the License. */ ...@@ -15,24 +15,28 @@ limitations under the License. */
#include "paddle/platform/device_context.h" #include "paddle/platform/device_context.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
using DEVICE_GPU = Eigen::GpuDevice;
TEST(Device, Init) { TEST(Device, Init) {
using paddle::platform::DeviceContext;
using paddle::platform::CUDADeviceContext;
using paddle::platform::GPUPlace;
int count = paddle::platform::GetDeviceCount(); int count = paddle::platform::GetDeviceCount();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
paddle::platform::DeviceContext* device_context = DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
new paddle::platform::CUDADeviceContext(i);
Eigen::GpuDevice* gpu_device = Eigen::GpuDevice* gpu_device =
device_context->template get_eigen_device<DEVICE_GPU>(); device_context->template get_eigen_device<Eigen::GpuDevice>();
ASSERT_NE(nullptr, gpu_device); ASSERT_NE(nullptr, gpu_device);
delete device_context; delete device_context;
} }
} }
TEST(Device, CUDADeviceContext) { TEST(Device, CUDADeviceContext) {
using paddle::platform::CUDADeviceContext;
using paddle::platform::GPUPlace;
int count = paddle::platform::GetDeviceCount(); int count = paddle::platform::GetDeviceCount();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
paddle::platform::CUDADeviceContext* device_context = CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
new paddle::platform::CUDADeviceContext(i);
Eigen::GpuDevice* gpu_device = device_context->eigen_device(); Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device); ASSERT_NE(nullptr, gpu_device);
cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
......
...@@ -32,7 +32,7 @@ struct CPUPlace { ...@@ -32,7 +32,7 @@ struct CPUPlace {
struct GPUPlace { struct GPUPlace {
GPUPlace() : GPUPlace(0) {} GPUPlace() : GPUPlace(0) {}
GPUPlace(int d) : device(d) {} // NOLINT explicit GPUPlace(int d) : device(d) {}
// needed for variant equality comparison // needed for variant equality comparison
inline bool operator==(const GPUPlace &o) const { return device == o.device; } inline bool operator==(const GPUPlace &o) const { return device == o.device; }
......
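
The explicit keywords added above (to the single-argument constructors of CUDADeviceContext and GPUPlace) mean an int no longer converts silently into a GPUPlace, and a GPUPlace no longer converts silently into a CUDADeviceContext. A minimal sketch of the effect, assuming an illustrative Launch function that is not part of the codebase:

    struct GPUPlace {
      GPUPlace() : GPUPlace(0) {}
      explicit GPUPlace(int d) : device(d) {}
      int device;
    };

    void Launch(GPUPlace place) { /* run something on the given device */ }

    int main() {
      // Launch(1);        // no longer compiles: int -> GPUPlace is explicit
      Launch(GPUPlace(1)); // OK: the conversion is spelled out, as in the updated test code
      return 0;
    }
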
configure_file(submit_local.sh.in configure_file(submit_local.sh.in
submit_local.sh paddle
@ONLY) @ONLY)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/submit_local.sh DESTINATION bin install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle DESTINATION bin
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
RENAME paddle)
configure_file(tools/usage_stat/usage.sh configure_file(tools/usage_stat/usage.sh
usage.sh paddle_usage
@ONLY) @ONLY)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/usage.sh DESTINATION opt/paddle/bin install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle_usage DESTINATION opt/paddle/bin
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
RENAME paddle_usage)
...@@ -39,6 +39,10 @@ Configuring cmake in /paddle/build ... ...@@ -39,6 +39,10 @@ Configuring cmake in /paddle/build ...
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
======================================== ========================================
EOF EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
# docker environment is fully controlled by this script.
# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
cmake .. \ cmake .. \
-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_BUILD_TYPE=Release \
-DWITH_DOC=OFF \ -DWITH_DOC=OFF \
...@@ -52,39 +56,43 @@ cmake .. \ ...@@ -52,39 +56,43 @@ cmake .. \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
cat <<EOF cat <<EOF
======================================== ============================================
Building in /paddle/build ... Building in /paddle/build ...
Build unit tests: ${WITH_TESTING:-OFF} Build unit tests: ${WITH_TESTING:-OFF}
======================================== ============================================
EOF EOF
make -j `nproc` make -j `nproc`
if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
pip uninstall -y py-paddle paddle || true
ctest --output-on-failure
fi
if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Installing ... Running unit tests ...
======================================== ========================================
EOF EOF
make install -j `nproc` # make install should also be test when unittest
pip install /usr/local/opt/paddle/share/wheels/*.whl make install -j `nproc`
paddle version pip install /usr/local/opt/paddle/share/wheels/*.whl
paddle version
ctest --output-on-failure
fi
# To build documentation, we need to run cmake again after installing # To build documentation, we need to run cmake again after installing
# PaddlePaddle. This awkwardness is due to # PaddlePaddle. This awkwardness is due to
# https://github.com/PaddlePaddle/Paddle/issues/1854. It also # https://github.com/PaddlePaddle/Paddle/issues/1854. It also
# describes a solution. # describes a solution.
if [[ ${WITH_DOC} == "ON" ]]; then if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
cat <<EOF cat <<EOF
======================================== ========================================
Building documentation ... Building documentation ...
In /paddle/build_doc In /paddle/build_doc
======================================== ========================================
EOF EOF
  # building documentation requires installing Paddle first
make install -j `nproc`
pip install /usr/local/opt/paddle/share/wheels/*.whl
paddle version
mkdir -p /paddle/build_doc mkdir -p /paddle/build_doc
pushd /paddle/build_doc pushd /paddle/build_doc
cmake .. \ cmake .. \
...@@ -117,13 +125,22 @@ fi ...@@ -117,13 +125,22 @@ fi
# generate deb package for current build # generate deb package for current build
# FIXME(typhoonzero): should we remove paddle/scripts/deb ? # FIXME(typhoonzero): should we remove paddle/scripts/deb ?
cat <<EOF if [[ ${WITH_DEB:-OFF} == "ON" ]]; then
cat <<EOF
======================================== ========================================
Generating .deb package ... Generating .deb package ...
======================================== ========================================
EOF EOF
cpack -D CPACK_GENERATOR='DEB' -j `nproc` .. set +e
cpack -D CPACK_GENERATOR='DEB' -j `nproc` ..
err_code=$?
if [ ${err_code} -ne 0 ]; then
# cat error logs if cpack failed.
cat /paddle/build/_CPack_Packages/Linux/DEB/PreinstallOutput.log
exit ${err_code}
fi
set -e
fi
cat <<EOF cat <<EOF
======================================== ========================================
......
...@@ -20,4 +20,4 @@ cmake -DCMAKE_SYSTEM_NAME=Android \ ...@@ -20,4 +20,4 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
-DWITH_SWIG_PY=OFF \ -DWITH_SWIG_PY=OFF \
.. ..
make -j `nproc` make -j `nproc`
make install make install -j `nproc`
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pushd `dirname $0` > /dev/null
SCRIPTPATH=$PWD
popd > /dev/null
USE_VIRTUALENV_FOR_TEST=$1; shift
PYTHON=$1; shift
if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then
rm -rf .test_env
virtualenv .test_env
unset PYTHONHOME
unset PYTHONPATH
source .test_env/bin/activate
PYTHON=python
fi
$PYTHON -m pip install $SCRIPTPATH/../dist/*.whl
if [ "X${PADDLE_PACKAGE_DIR}" != "X" ]; then
$PYTHON -m pip install ${PADDLE_PACKAGE_DIR}/*.whl
else
export PYTHONPATH=$SCRIPTPATH/../../python/
fi
$PYTHON -m pip install ipython==5.3
for fn in "$@"
do
echo "test $fn"
$PYTHON $fn
if [ $? -ne 0 ]; then
exit 1
fi
done
if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then
deactivate
rm -rf .test_env
fi
File mode changed from 100644 to 100755
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from setuptools import setup, Extension
setup(name="py_paddle",
version="${PADDLE_VERSION}",
packages=['py_paddle'],
include_package_data=True,
package_data={'py_paddle':['*.py','_swig_paddle.so']},
install_requires = [
'nltk>=3.2.2',
'numpy>=1.8.0', # The numpy is required.
'protobuf==${PROTOBUF_VERSION}' # The paddle protobuf version
],
url='http://www.paddlepaddle.org/',
license='Apache 2.0',
)
...@@ -39,8 +39,8 @@ public: ...@@ -39,8 +39,8 @@ public:
// size_ is 0. // size_ is 0.
Piece(); Piece();
Piece(const char* d, size_t n); Piece(const char* d, size_t n);
Piece(const char* d); // NOLINT Piece(const char* d); // NOLINT: accept C string into Piece.
Piece(const std::string& s); // NOLINT Piece(const std::string& s); // NOLINT: accept C++ string into Piece.
const char* data() const { return data_; } const char* data() const { return data_; }
size_t len() const { return size_; } size_t len() const { return size_; }
......
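
The expanded NOLINT comments make the intent explicit: these two converting constructors stay implicit on purpose, so a string literal or a std::string can be passed wherever a Piece is expected. A hedged usage sketch (the Piece bodies are reduced to stubs here, and Print is illustrative, not part of the API):

    #include <string>

    class Piece {
     public:
      Piece(const char* d) {}         // NOLINT: accept C strings implicitly.
      Piece(const std::string& s) {}  // NOLINT: accept C++ strings implicitly.
    };

    void Print(Piece p) {}

    int main() {
      Print("hello");               // const char* -> Piece, no cast needed
      Print(std::string("world"));  // std::string -> Piece, no cast needed
      return 0;
    }
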
./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto ./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data
#edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#Todo(luotao02) This config is only used for unit tests. It is out of date now, and will be updated later.
# Note: when making changes to this file, please make sure
# sample_trainer_config_rnn.conf is changed accordingly so that the unittest
# for comparing these two nets can pass (test_CompareTwoNets)
default_initial_std(0.1)
default_device(0)
word_dim = 999
l1 = 0
l2 = 0
model_type("nn")
sparse_update = get_config_arg("sparse_update", bool, False)
TrainData(ProtoData(
type = "proto_sequence",
files = ('trainer/tests/train_sparse.list'),
))
Settings(
algorithm='sgd',
batch_size=100,
learning_rate=0.0001,
learning_rate_decay_a=4e-08,
learning_rate_decay_b=0.0,
learning_rate_schedule='poly',
)
wordvec_dim = 32
layer2_dim = 16
layer3_dim = 16
hidden_dim = 32
slot_names = ["qb", "qw", "tb", "tw"]
def ltr_network(network_name,
word_dim=word_dim,
wordvec_dim=wordvec_dim,
layer2_dim=layer2_dim,
layer3_dim=layer3_dim,
hidden_dim=hidden_dim,
slot_names=slot_names,
l1=l1,
l2=l2):
slotnum = len(slot_names)
for i in xrange(slotnum):
Inputs(slot_names[i] + network_name)
for i in xrange(slotnum):
Layer(
name = slot_names[i] + network_name,
type = "data",
size = word_dim,
device = -1,
)
Layer(
name = slot_names[i] + "_embedding_" + network_name,
type = "mixed",
size = wordvec_dim,
bias = False,
device = -1,
inputs = TableProjection(slot_names[i] + network_name,
parameter_name = "embedding.w0",
decay_rate_l1=l1,
sparse_remote_update = True,
sparse_update = sparse_update,
),
)
Layer(
name = slot_names[i] + "_rnn1_" + network_name,
type = "recurrent",
active_type = "tanh",
bias = Bias(initial_std = 0,
parameter_name = "rnn1.bias"),
inputs = Input(slot_names[i] + "_embedding_" + network_name,
parameter_name = "rnn1.w0")
)
Layer(
name = slot_names[i] + "_rnnlast_" + network_name,
type = "seqlastins",
inputs = [
slot_names[i] + "_rnn1_" + network_name,
],
)
Layer(
name = "layer2_" + network_name,
type = "fc",
active_type = "tanh",
size = layer2_dim,
bias = Bias(parameter_name = "layer2.bias"),
inputs = [Input(slot_name + "_rnnlast_" + network_name,
parameter_name = "_layer2_" + slot_name + ".w",
decay_rate = l2,
initial_smart = True) for slot_name in slot_names]
)
Layer(
name = "layer3_" + network_name,
type = "fc",
active_type = "tanh",
size = layer3_dim,
bias = Bias(parameter_name = "layer3.bias"),
inputs = [
Input("layer2_" + network_name,
parameter_name = "_layer3.w",
decay_rate = l2,
initial_smart = True),
]
)
Layer(
name = "output_" + network_name,
type = "fc",
size = 1,
bias = False,
inputs = [
Input("layer3_" + network_name,
parameter_name = "_layerO.w"),
],
)
ltr_network("left")
ltr_network("right")
Inputs("label")
Layer(
name = "label",
type = "data",
size = 1,
)
Outputs("cost", "qb_rnnlast_left")
Layer(
name = "cost",
type = "rank-cost",
inputs = ["output_left", "output_right", "label"],
)
from paddle.trainer_config_helpers import * from paddle.trainer_config_helpers import *
settings(batch_size=128, learning_method=AdaGradOptimizer(), learning_rate=1e-4) settings(batch_size=17, learning_method=AdaGradOptimizer(), learning_rate=1e-4)
file_list = 'trainer/tests/fake_file_list.list' file_list = 'trainer/tests/fake_file_list.list'
...@@ -12,7 +12,7 @@ define_py_data_sources2( ...@@ -12,7 +12,7 @@ define_py_data_sources2(
embedding = embedding_layer( embedding = embedding_layer(
input=data_layer( input=data_layer(
name="word_ids", size=65536), name="word_ids", size=8191),
size=128, size=128,
param_attr=ParamAttr(sparse_update=True)) param_attr=ParamAttr(sparse_update=True))
prediction = fc_layer(input=embedding, size=10, act=SoftmaxActivation()) prediction = fc_layer(input=embedding, size=10, act=SoftmaxActivation())
......
...@@ -7,15 +7,15 @@ def init_hook(settings, is_train, **kwargs): ...@@ -7,15 +7,15 @@ def init_hook(settings, is_train, **kwargs):
@provider( @provider(
input_types={'word_ids': integer_value(65536), input_types={'word_ids': integer_value(8191),
'label': integer_value(10)}, 'label': integer_value(10)},
min_pool_size=0, min_pool_size=0,
init_hook=init_hook) init_hook=init_hook)
def process(settings, filename): def process(settings, filename):
if settings.is_train: if settings.is_train:
data_size = 2**20
else:
data_size = 2**10 data_size = 2**10
else:
data_size = 2**5
for _ in xrange(data_size): for _ in xrange(data_size):
yield random.randint(0, 65535), random.randint(0, 9) yield random.randint(0, 8190), random.randint(0, 9)
...@@ -23,7 +23,7 @@ using namespace paddle; // NOLINT ...@@ -23,7 +23,7 @@ using namespace paddle; // NOLINT
using namespace std; // NOLINT using namespace std; // NOLINT
static const string& configFile1 = static const string& configFile1 =
"trainer/tests/sample_trainer_config_qb_rnn.conf"; "trainer/tests/sample_trainer_config_compare_sparse.conf";
DECLARE_bool(use_gpu); DECLARE_bool(use_gpu);
DECLARE_string(config); DECLARE_string(config);
......
...@@ -100,25 +100,25 @@ TEST(average_window, gpu) { ...@@ -100,25 +100,25 @@ TEST(average_window, gpu) {
} }
TEST(average_window, gpu2) { TEST(average_window, gpu2) {
FLAGS_num_passes = 100; FLAGS_num_passes = 20;
trainerOnePassTest(configFile1, true, false, 2, 0.01); trainerOnePassTest(configFile1, true, false, 2, 0.01);
FLAGS_num_passes = 1; FLAGS_num_passes = 1;
} }
TEST(average_window, gpu4) { TEST(average_window, gpu4) {
FLAGS_num_passes = 100; FLAGS_num_passes = 20;
trainerOnePassTest(configFile1, true, false, 4, 0.01); trainerOnePassTest(configFile1, true, false, 4, 0.01);
FLAGS_num_passes = 1; FLAGS_num_passes = 1;
} }
TEST(average_window_cpu, gpu2) { TEST(average_window_cpu, gpu2) {
FLAGS_num_passes = 100; FLAGS_num_passes = 20;
trainerOnePassTest(configFile1, true, false, 2, 0.01, true); trainerOnePassTest(configFile1, true, false, 2, 0.01, true);
FLAGS_num_passes = 1; FLAGS_num_passes = 1;
} }
TEST(average_window_cpu, gpu4) { TEST(average_window_cpu, gpu4) {
FLAGS_num_passes = 100; FLAGS_num_passes = 20;
trainerOnePassTest(configFile1, true, false, 4, 0.01, true); trainerOnePassTest(configFile1, true, false, 4, 0.01, true);
FLAGS_num_passes = 1; FLAGS_num_passes = 1;
} }
......
trainer/tests/compare_sparse_data
...@@ -15,14 +15,13 @@ syntax = "proto2"; ...@@ -15,14 +15,13 @@ syntax = "proto2";
package paddle; package paddle;
message FileGroupConf { message FileGroupConf {
optional uint32 queue_capacity = 1 [default = 1]; optional uint32 queue_capacity = 1 [ default = 1 ];
// how many files to load for a load file thread // how many files to load for a load file thread
optional int32 load_file_count = 2 [default = 1]; optional int32 load_file_count = 2 [ default = 1 ];
// how many threads to load files // how many threads to load files
// Setting to be 5~10 is appropriate when loading files by hadoop vfs // Setting to be 5~10 is appropriate when loading files by hadoop vfs
optional int32 load_thread_num = 3 [default = 1]; optional int32 load_thread_num = 3 [ default = 1 ];
}; };
message DataConfig { message DataConfig {
...@@ -32,26 +31,28 @@ message DataConfig { ...@@ -32,26 +31,28 @@ message DataConfig {
// name of a text file which contains a list of file names at each line // name of a text file which contains a list of file names at each line
optional string files = 3; optional string files = 3;
optional int32 feat_dim = 4;//feature dimension of one frame optional int32 feat_dim = 4; // feature dimension of one frame
repeated int32 slot_dims = 5;//feature slot dims repeated int32 slot_dims = 5; // feature slot dims
optional int32 context_len = 6;//max neibour frame numbers optional int32 context_len = 6; // max neibour frame numbers
optional uint64 buffer_capacity = 7;//the number of samples optional uint64 buffer_capacity = 7; // the number of samples
//part of data used in training // part of data used in training
//if not -1, part of train data is used in training // if not -1, part of train data is used in training
optional int64 train_sample_num = 8 [default = -1]; optional int64 train_sample_num = 8 [ default = -1 ];
//The number of documents processed once // The number of documents processed once
optional int32 file_load_num = 9 [default = -1]; optional int32 file_load_num = 9 [ default = -1 ];
optional bool async_load_data = 12 [default = false]; optional bool async_load_data = 12 [ default = false ];
/// Note the field number 10, 11 and 13 have been deprecated. /// Note the field number 10, 11 and 13 have been deprecated.
optional bool for_test = 14 [default = false]; // whether this data is for test optional bool for_test = 14
[ default = false ]; // whether this data is for test
optional FileGroupConf file_group_conf = 15; optional FileGroupConf file_group_conf = 15;
repeated int32 float_slot_dims = 16; repeated int32 float_slot_dims = 16;
/// Note the field number 17, 18 and 19 have been deprecated. /// Note the field number 17, 18 and 19 have been deprecated.
// a list of values which will be used to create additional one dimensional float // a list of values which will be used to create additional one dimensional
// float
// values slots. These one dimensional slots can be used as the weight input // values slots. These one dimensional slots can be used as the weight input
// for cost layers. // for cost layers.
// Currently this is only supported by ProtoDataProvider. // Currently this is only supported by ProtoDataProvider.
...@@ -65,21 +66,21 @@ message DataConfig { ...@@ -65,21 +66,21 @@ message DataConfig {
// for MultiDataProvider // for MultiDataProvider
repeated DataConfig sub_data_configs = 24; // sub dataproviders repeated DataConfig sub_data_configs = 24; // sub dataproviders
/* /*
* the ratio of each sub dataproviders: * the ratio of each sub dataproviders:
* e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100, * e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100,
* then each mini-batch is combined by 10 instance from A and 90 instances * then each mini-batch is combined by 10 instance from A and 90 instances
* from B. * from B.
*/ */
optional int32 data_ratio = 25; optional int32 data_ratio = 25;
/* /*
* if one of the sub dataproviders is running out of data, then * if one of the sub dataproviders is running out of data, then
* (1) it is "main data", then finish current pass. * (1) it is "main data", then finish current pass.
* (2) it is not "main data", then reset it, and try getNextBatch again. * (2) it is not "main data", then reset it, and try getNextBatch again.
*/ */
optional bool is_main_data = 26 [default = true]; optional bool is_main_data = 26 [ default = true ];
// the usage ratio of instances. Setting to 1.0 means the use of all instances. // the usage ratio of instances. Setting to 1.0 means the use of all
optional double usage_ratio = 27 [default = 1.0]; // instances.
optional double usage_ratio = 27 [ default = 1.0 ];
}; };
...@@ -17,27 +17,32 @@ package paddle; ...@@ -17,27 +17,32 @@ package paddle;
/* /*
If values is not empty and ids is empty, this is a dense vector. If values is not empty and ids is empty, this is a dense vector.
If values is not empty and ids is not empty, this is a sparse vector. The position of each value If values is not empty and ids is not empty, this is a sparse vector. The
position of each value
is specified by ids. is specified by ids.
If values is empty and ids is not empty, this is a sparse vector whose non-zero values are 1. If values is empty and ids is not empty, this is a sparse vector whose non-zero
values are 1.
The position of each 1 is specified by ids. The position of each 1 is specified by ids.
*/ */
message VectorSlot { message VectorSlot {
repeated float values = 1 [packed = true]; repeated float values = 1 [ packed = true ];
repeated uint32 ids = 2 [packed = true]; repeated uint32 ids = 2 [ packed = true ];
/* For multidimensional data, for example "image width height depth" */ /* For multidimensional data, for example "image width height depth" */
repeated uint32 dims = 3 [packed = true]; repeated uint32 dims = 3 [ packed = true ];
repeated string strs = 4; repeated string strs = 4;
}; };
/* /*
SubseqSlot use to record whether VectorSlot or any other slot in future has subseq. SubseqSlot use to record whether VectorSlot or any other slot in future has
If not all VectorSlot have subseq, we only store the one who has subseq, and use *slot_id* to record it. subseq.
One vector_slots has one sequence, and it may have N subseq, thus the number of *lens* will be N too. If not all VectorSlot have subseq, we only store the one who has subseq, and
use *slot_id* to record it.
One vector_slots has one sequence, and it may have N subseq, thus the number of
*lens* will be N too.
*/ */
message SubseqSlot { message SubseqSlot {
required uint32 slot_id = 1; //the id of slot who has subseq required uint32 slot_id = 1; // the id of slot who has subseq
repeated uint32 lens = 2; // lengths of sub-sequence in the slot repeated uint32 lens = 2; // lengths of sub-sequence in the slot
}; };
message SlotDef { message SlotDef {
...@@ -45,13 +50,14 @@ message SlotDef { ...@@ -45,13 +50,14 @@ message SlotDef {
VECTOR_DENSE = 0; VECTOR_DENSE = 0;
VECTOR_SPARSE_NON_VALUE = 1; VECTOR_SPARSE_NON_VALUE = 1;
VECTOR_SPARSE_VALUE = 2; VECTOR_SPARSE_VALUE = 2;
INDEX = 3; // This can be used as label, or word id, etc. INDEX = 3; // This can be used as label, or word id, etc.
VAR_MDIM_DENSE = 4; VAR_MDIM_DENSE = 4;
VAR_MDIM_INDEX = 5; VAR_MDIM_INDEX = 5;
STRING = 6; STRING = 6;
} }
required SlotType type = 1; required SlotType type = 1;
required uint32 dim = 2; // For INDEX slots, this means the maximal index plus 1. required uint32 dim =
2; // For INDEX slots, this means the maximal index plus 1.
}; };
message DataHeader { message DataHeader {
...@@ -60,11 +66,11 @@ message DataHeader { ...@@ -60,11 +66,11 @@ message DataHeader {
}; };
message DataSample { message DataSample {
optional bool is_beginning = 1 [default = true]; // is the beginning of a sequence optional bool is_beginning = 1
[ default = true ]; // is the beginning of a sequence
repeated VectorSlot vector_slots = 2; repeated VectorSlot vector_slots = 2;
repeated uint32 id_slots = 3 [packed = true]; repeated uint32 id_slots = 3 [ packed = true ];
/* use ids of VectorSlot */ /* use ids of VectorSlot */
repeated VectorSlot var_id_slots = 4; repeated VectorSlot var_id_slots = 4;
repeated SubseqSlot subseq_slots = 5; repeated SubseqSlot subseq_slots = 5;
}; };
...@@ -21,7 +21,6 @@ package paddle; ...@@ -21,7 +21,6 @@ package paddle;
* Various structs for the configuration of a neural network * Various structs for the configuration of a neural network
*/ */
message ExternalConfig { message ExternalConfig {
repeated string layer_names = 1; repeated string layer_names = 1;
repeated string input_layer_names = 2; repeated string input_layer_names = 2;
...@@ -68,7 +67,7 @@ message ConvConfig { ...@@ -68,7 +67,7 @@ message ConvConfig {
required uint32 img_size = 8; required uint32 img_size = 8;
// caffe mode for output size coherence // caffe mode for output size coherence
required bool caffe_mode = 9 [default = true]; required bool caffe_mode = 9 [ default = true ];
// if filter_size_y is set , this convolutional layer will use // if filter_size_y is set , this convolutional layer will use
// filters of size filter_size * filter_size_y pixels. // filters of size filter_size * filter_size_y pixels.
...@@ -99,7 +98,7 @@ message PoolConfig { ...@@ -99,7 +98,7 @@ message PoolConfig {
optional uint32 start = 4; optional uint32 start = 4;
// Defines the stride size between successive pooling squares. // Defines the stride size between successive pooling squares.
required uint32 stride = 5 [default = 1]; required uint32 stride = 5 [ default = 1 ];
// The size of output feature map. // The size of output feature map.
required uint32 output_x = 6; required uint32 output_x = 6;
...@@ -109,7 +108,7 @@ message PoolConfig { ...@@ -109,7 +108,7 @@ message PoolConfig {
// padding = 4, instructs the net to implicitly // padding = 4, instructs the net to implicitly
// pad the images with a 4-pixel border of zeros. // pad the images with a 4-pixel border of zeros.
optional uint32 padding = 8 [default = 0]; optional uint32 padding = 8 [ default = 0 ];
// if not set, use size_x // if not set, use size_x
optional uint32 size_y = 9; optional uint32 size_y = 9;
...@@ -194,9 +193,7 @@ message MaxOutConfig { ...@@ -194,9 +193,7 @@ message MaxOutConfig {
required uint32 groups = 2; required uint32 groups = 2;
} }
message RowConvConfig { message RowConvConfig { required uint32 context_length = 1; }
required uint32 context_length = 1;
}
message SliceConfig { message SliceConfig {
required uint32 start = 1; required uint32 start = 1;
...@@ -212,14 +209,14 @@ message ProjectionConfig { ...@@ -212,14 +209,14 @@ message ProjectionConfig {
// For ShiftProjection // For ShiftProjection
optional int32 context_start = 5; optional int32 context_start = 5;
optional int32 context_length = 6; optional int32 context_length = 6;
optional bool trainable_padding = 7 [default = false]; optional bool trainable_padding = 7 [ default = false ];
// For convolution // For convolution
optional ConvConfig conv_conf = 8; optional ConvConfig conv_conf = 8;
optional int32 num_filters = 9; optional int32 num_filters = 9;
// For IdentityOffsetProjection // For IdentityOffsetProjection
optional uint64 offset = 11 [default = 0]; optional uint64 offset = 11 [ default = 0 ];
// For pool // For pool
optional PoolConfig pool_conf = 12; optional PoolConfig pool_conf = 12;
...@@ -236,7 +233,7 @@ message OperatorConfig { ...@@ -236,7 +233,7 @@ message OperatorConfig {
required uint64 output_size = 4; required uint64 output_size = 4;
// For DotMulOperator // For DotMulOperator
optional double dotmul_scale = 5 [default = 1.0]; optional double dotmul_scale = 5 [ default = 1.0 ];
// For ConvOperator // For ConvOperator
optional ConvConfig conv_conf = 6; optional ConvConfig conv_conf = 6;
...@@ -282,8 +279,8 @@ message MultiBoxLossConfig { ...@@ -282,8 +279,8 @@ message MultiBoxLossConfig {
required float neg_overlap = 4; required float neg_overlap = 4;
required uint32 background_id = 5; required uint32 background_id = 5;
required uint32 input_num = 6; required uint32 input_num = 6;
optional uint32 height = 7 [default = 1]; optional uint32 height = 7 [ default = 1 ];
optional uint32 width = 8 [default = 1]; optional uint32 width = 8 [ default = 1 ];
} }
message DetectionOutputConfig { message DetectionOutputConfig {
...@@ -294,8 +291,8 @@ message DetectionOutputConfig { ...@@ -294,8 +291,8 @@ message DetectionOutputConfig {
required uint32 input_num = 5; required uint32 input_num = 5;
required uint32 keep_top_k = 6; required uint32 keep_top_k = 6;
required float confidence_threshold = 7; required float confidence_threshold = 7;
optional uint32 height = 8 [default = 1]; optional uint32 height = 8 [ default = 1 ];
optional uint32 width = 9 [default = 1]; optional uint32 width = 9 [ default = 1 ];
} }
message ClipConfig { message ClipConfig {
...@@ -331,7 +328,7 @@ message LayerConfig { ...@@ -331,7 +328,7 @@ message LayerConfig {
required string name = 1; required string name = 1;
required string type = 2; required string type = 2;
optional uint64 size = 3; optional uint64 size = 3;
//optional ActivationConfig activation = 4; // optional ActivationConfig activation = 4;
optional string active_type = 4; optional string active_type = 4;
repeated LayerInputConfig inputs = 5; repeated LayerInputConfig inputs = 5;
optional string bias_parameter_name = 6; optional string bias_parameter_name = 6;
...@@ -344,7 +341,7 @@ message LayerConfig { ...@@ -344,7 +341,7 @@ message LayerConfig {
// (which is how convnets are usually trained). Setting this to // (which is how convnets are usually trained). Setting this to
// false will untie the biases, yielding a separate bias for // false will untie the biases, yielding a separate bias for
// every location at which the filter is applied. // every location at which the filter is applied.
optional bool shared_biases = 8 [default = false]; optional bool shared_biases = 8 [ default = false ];
// Valid values are ones that divide the area of the output // Valid values are ones that divide the area of the output
// grid in this convolutional layer. For example if this layer // grid in this convolutional layer. For example if this layer
...@@ -362,33 +359,35 @@ message LayerConfig { ...@@ -362,33 +359,35 @@ message LayerConfig {
// the gpu device which the Layer's data in. // the gpu device which the Layer's data in.
// Only used by ParallelNeuralNetork. Ignored otherwise. // Only used by ParallelNeuralNetork. Ignored otherwise.
optional int32 device = 12 [default = -1]; optional int32 device = 12 [ default = -1 ];
// for recurrent layer. If true, the recurrence runs from the end to the beginning. // for recurrent layer. If true, the recurrence runs from the end to the
optional bool reversed = 13 [default = false]; // beginning.
optional bool reversed = 13 [ default = false ];
// for lstmemory layer. Different types of nodes have different activation type. // for lstmemory layer. Different types of nodes have different activation
optional string active_gate_type = 14; // type.
optional string active_gate_type = 14;
optional string active_state_type = 15; optional string active_state_type = 15;
// For NCELayer // For NCELayer
// The number of random negative labels for each sample // The number of random negative labels for each sample
optional int32 num_neg_samples = 16 [default = 10]; optional int32 num_neg_samples = 16 [ default = 10 ];
// For NCELayer // For NCELayer
// The distribution for generating the random negative labels. // The distribution for generating the random negative labels.
// A uniform distribution will be used if not provided // A uniform distribution will be used if not provided
repeated double neg_sampling_dist = 17 [packed = true]; repeated double neg_sampling_dist = 17 [ packed = true ];
// For MaxLayer // For MaxLayer
// default: output VALUE of MaxLayer. set this flag to true for output INDEX // default: output VALUE of MaxLayer. set this flag to true for output INDEX
// INDEX will be put in Argument::value as double values. // INDEX will be put in Argument::value as double values.
optional bool output_max_index = 19 [default = false]; optional bool output_max_index = 19 [ default = false ];
/// The filed number 20 have been deprecated. /// The filed number 20 have been deprecated.
// For self-normalized estimation // For self-normalized estimation
optional double softmax_selfnorm_alpha = 21 [default = 0.1]; optional double softmax_selfnorm_alpha = 21 [ default = 0.1 ];
/// The filed numbers 22 and 23 have been deprecated. /// The filed numbers 22 and 23 have been deprecated.
...@@ -399,14 +398,14 @@ message LayerConfig { ...@@ -399,14 +398,14 @@ message LayerConfig {
optional bool norm_by_times = 25; optional bool norm_by_times = 25;
// for CostLayers // for CostLayers
optional double coeff = 26 [default = 1.0]; optional double coeff = 26 [ default = 1.0 ];
// for AverageLayer // for AverageLayer
// can be set to: 'average', 'sum' or 'squarerootn' // can be set to: 'average', 'sum' or 'squarerootn'
optional string average_strategy = 27; optional string average_strategy = 27;
// for error clipping // for error clipping
optional double error_clipping_threshold = 28 [default = 0.0]; optional double error_clipping_threshold = 28 [ default = 0.0 ];
// for operators used by mixed layer // for operators used by mixed layer
repeated OperatorConfig operator_confs = 29; repeated OperatorConfig operator_confs = 29;
...@@ -434,43 +433,44 @@ message LayerConfig { ...@@ -434,43 +433,44 @@ message LayerConfig {
optional uint32 beam_size = 39; optional uint32 beam_size = 39;
// for seqlastins layer, whether select first instead last // for seqlastins layer, whether select first instead last
optional bool select_first = 40 [default = false]; optional bool select_first = 40 [ default = false ];
// for seqlastins layer, AverageLayer, MaxLayer and ExpandLayer // for seqlastins layer, AverageLayer, MaxLayer and ExpandLayer
// can be set to: 'non-seq','seq' // can be set to: 'non-seq','seq'
optional string trans_type = 41 [default = 'non-seq']; optional string trans_type = 41 [ default = 'non-seq' ];
// to indicate whether selective_fc layer // to indicate whether selective_fc layer
// is used in sequence generation or not // is used in sequence generation or not
optional bool selective_fc_pass_generation = 42 [default = false]; optional bool selective_fc_pass_generation = 42 [ default = false ];
// to indicate whether selective_fc layer take its last input to // to indicate whether selective_fc layer take its last input to
// selected several columns and only compute the multiplications // selected several columns and only compute the multiplications
// between the input matrices and the selected columns of // between the input matrices and the selected columns of
// the parameter matrices of this layer. // the parameter matrices of this layer.
// if set false, selective_fc degrades into fc. // if set false, selective_fc degrades into fc.
optional bool has_selected_colums = 43 [default = true]; optional bool has_selected_colums = 43 [ default = true ];
// this parameter is for speed consideration. // this parameter is for speed consideration.
// if number of the selected columns is less than // if number of the selected columns is less than
// sample number * selective_fc output size * selective_fc_mull_mull_ratio // sample number * selective_fc output size * selective_fc_mull_mull_ratio
// sparse multiplication is used, otherwise, using full multiplication. // sparse multiplication is used, otherwise, using full multiplication.
optional double selective_fc_full_mul_ratio = 44 [default = 0.02]; optional double selective_fc_full_mul_ratio = 44 [ default = 0.02 ];
// to indicate how many threads selective_fc use to to accelate // to indicate how many threads selective_fc use to to accelate
// the plain_mul period // the plain_mul period
// leave empty or set to 0 to disable multi-thread accleleration // leave empty or set to 0 to disable multi-thread accleleration
optional uint32 selective_fc_parallel_plain_mul_thread_num = 45 [default = 0]; optional uint32 selective_fc_parallel_plain_mul_thread_num = 45
[ default = 0 ];
// for batch normalization layer // for batch normalization layer
// if set use_global_stats true, will use the loaded mean and variance. // if set use_global_stats true, will use the loaded mean and variance.
optional bool use_global_stats = 46; optional bool use_global_stats = 46;
// use to compute moving mean and variance. // use to compute moving mean and variance.
optional double moving_average_fraction = 47 [default = 0.9]; optional double moving_average_fraction = 47 [ default = 0.9 ];
// bias size // bias size
optional uint32 bias_size = 48 [default = 0]; optional uint32 bias_size = 48 [ default = 0 ];
// this parameter can be used as a user-defined parameter when necessary, // this parameter can be used as a user-defined parameter when necessary,
// without changing the proto file. // without changing the proto file.
...@@ -485,18 +485,17 @@ message LayerConfig { ...@@ -485,18 +485,17 @@ message LayerConfig {
optional uint64 width = 51; optional uint64 width = 51;
// blank label used in ctc loss // blank label used in ctc loss
optional uint32 blank = 52 [default = 0]; optional uint32 blank = 52 [ default = 0 ];
// stride parameter for seqlastins layer, AverageLayer, MaxLayer, which // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which
// controls the scope of pooling operation. can be set > 0. // controls the scope of pooling operation. can be set > 0.
// leave empty or set to -1 to disable this stride pooling. // leave empty or set to -1 to disable this stride pooling.
optional int32 seq_pool_stride = 53 [default = -1]; optional int32 seq_pool_stride = 53 [ default = -1 ];
// for crop layer // for crop layer
optional int32 axis = 54 [default = 2]; optional int32 axis = 54 [ default = 2 ];
repeated uint32 offset = 55; repeated uint32 offset = 55;
repeated uint32 shape = 56; repeated uint32 shape = 56;
} }
message EvaluatorConfig { message EvaluatorConfig {
...@@ -512,9 +511,9 @@ message EvaluatorConfig { ...@@ -512,9 +511,9 @@ message EvaluatorConfig {
// Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator // Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator
// For multi binary labels: true if output > classification_threshold // For multi binary labels: true if output > classification_threshold
optional double classification_threshold = 6 [default = 0.5]; optional double classification_threshold = 6 [ default = 0.5 ];
// The positive label. -1 means average precision and recall // The positive label. -1 means average precision and recall
optional int32 positive_label = 7 [default = -1]; optional int32 positive_label = 7 [ default = -1 ];
// load dict from this file // load dict from this file
optional string dict_file = 8; optional string dict_file = 8;
...@@ -523,10 +522,10 @@ message EvaluatorConfig { ...@@ -523,10 +522,10 @@ message EvaluatorConfig {
optional string result_file = 9; optional string result_file = 9;
// top # results for max id printer // top # results for max id printer
optional int32 num_results = 10 [default = 1]; optional int32 num_results = 10 [ default = 1 ];
// whether to delimit the sequence in the seq_text_printer // whether to delimit the sequence in the seq_text_printer
optional bool delimited = 11 [default = true]; optional bool delimited = 11 [ default = true ];
// Used by ChunkEvaluator // Used by ChunkEvaluator
// chunk of these types are not counted // chunk of these types are not counted
...@@ -534,23 +533,23 @@ message EvaluatorConfig { ...@@ -534,23 +533,23 @@ message EvaluatorConfig {
// Used by ClassificationErrorEvaluator // Used by ClassificationErrorEvaluator
// top # classification error // top # classification error
optional int32 top_k = 13 [default = 1]; optional int32 top_k = 13 [ default = 1 ];
// Used by DetectionMAPEvaluator // Used by DetectionMAPEvaluator
optional double overlap_threshold = 14 [default = 0.5]; optional double overlap_threshold = 14 [ default = 0.5 ];
optional int32 background_id = 15 [default = 0]; optional int32 background_id = 15 [ default = 0 ];
optional bool evaluate_difficult = 16 [default = false]; optional bool evaluate_difficult = 16 [ default = false ];
optional string ap_type = 17 [default = "11point"]; optional string ap_type = 17 [ default = "11point" ];
} }
message LinkConfig { message LinkConfig {
required string layer_name = 1; required string layer_name = 1;
required string link_name = 2; required string link_name = 2;
// If true, this link has sub-sequence // If true, this link has sub-sequence
optional bool has_subseq = 3 [default = false]; optional bool has_subseq = 3 [ default = false ];
} }
message MemoryConfig { message MemoryConfig {
...@@ -563,18 +562,18 @@ message MemoryConfig { ...@@ -563,18 +562,18 @@ message MemoryConfig {
optional uint32 boot_with_const_id = 7; optional uint32 boot_with_const_id = 7;
// memory is a sequence, initailized by a sequence boot layer // memory is a sequence, initailized by a sequence boot layer
optional bool is_sequence = 6 [default = false]; optional bool is_sequence = 6 [ default = false ];
} }
message GeneratorConfig { message GeneratorConfig {
required uint32 max_num_frames = 1; required uint32 max_num_frames = 1;
required string eos_layer_name = 2; required string eos_layer_name = 2;
optional int32 num_results_per_sample = 3 [default = 1]; optional int32 num_results_per_sample = 3 [ default = 1 ];
// for beam search // for beam search
optional int32 beam_size = 4 [default = 1]; optional int32 beam_size = 4 [ default = 1 ];
optional bool log_prob = 5 [default = true]; optional bool log_prob = 5 [ default = true ];
} }
message SubModelConfig { message SubModelConfig {
...@@ -584,10 +583,10 @@ message SubModelConfig { ...@@ -584,10 +583,10 @@ message SubModelConfig {
repeated string output_layer_names = 4; repeated string output_layer_names = 4;
repeated string evaluator_names = 5; repeated string evaluator_names = 5;
optional bool is_recurrent_layer_group = 6 [default = false]; optional bool is_recurrent_layer_group = 6 [ default = false ];
// If true, the recurrence runs from the end to the beginning. // If true, the recurrence runs from the end to the beginning.
optional bool reversed = 7 [default = false]; optional bool reversed = 7 [ default = false ];
// name and link name of memory // name and link name of memory
repeated MemoryConfig memories = 8; repeated MemoryConfig memories = 8;
...@@ -601,14 +600,15 @@ message SubModelConfig { ...@@ -601,14 +600,15 @@ message SubModelConfig {
optional GeneratorConfig generator = 11; optional GeneratorConfig generator = 11;
// the id of inlink which share info with outlinks, used in recurrent layer group // the id of inlink which share info with outlinks, used in recurrent layer
// group
optional int32 target_inlinkid = 12; optional int32 target_inlinkid = 12;
} }
message ModelConfig { message ModelConfig {
// type of the model. // type of the model.
// Currently, "nn", "recurrent_nn" and "recursive_nn" are supported // Currently, "nn", "recurrent_nn" and "recursive_nn" are supported
required string type = 1 [default = "nn"]; required string type = 1 [ default = "nn" ];
// layers should be ordered in such a way that the forward propagation // layers should be ordered in such a way that the forward propagation
// can be correctly executed by going from the first layer to the last layer // can be correctly executed by going from the first layer to the last layer
......
syntax = "proto2"; syntax = "proto2";
option optimize_for = LITE_RUNTIME; option optimize_for = LITE_RUNTIME;
package paddle; package paddle;
...@@ -9,13 +9,11 @@ message SGDConfig { ...@@ -9,13 +9,11 @@ message SGDConfig {
// momentum: float >= 0. Parameter updates momentum. // momentum: float >= 0. Parameter updates momentum.
// decay: float >= 0. Learning rate decay over each update. // decay: float >= 0. Learning rate decay over each update.
// nesterov: boolean. Whether to apply Nesterov momentum. // nesterov: boolean. Whether to apply Nesterov momentum.
optional double momentum = 21 [default = 0.0]; optional double momentum = 21 [ default = 0.0 ];
optional double decay = 23 [default = 0.0]; optional double decay = 23 [ default = 0.0 ];
optional bool nesterov =24 [default = false]; optional bool nesterov = 24 [ default = false ];
} }
message AdadeltaConfig { message AdadeltaConfig {
// Adadelta // Adadelta
// It is recommended to leave it at the default value. // It is recommended to leave it at the default value.
...@@ -23,21 +21,23 @@ message AdadeltaConfig { ...@@ -23,21 +21,23 @@ message AdadeltaConfig {
// epsilon: float >= 0. Fuzz factor. // epsilon: float >= 0. Fuzz factor.
// decay: float >= 0. Learning rate decay over each update. // decay: float >= 0. Learning rate decay over each update.
// reference : [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701) // reference : [Adadelta - an adaptive learning rate
optional double rho = 33 [default = 0.90]; // method](http://arxiv.org/abs/1212.5701)
optional double epsilon = 31 [default = 1e-5]; optional double rho = 33 [ default = 0.90 ];
optional double decay = 32 [default = 0.0]; optional double epsilon = 31 [ default = 1e-5 ];
optional double decay = 32 [ default = 0.0 ];
} }
message AdagradConfig { message AdagradConfig {
// Adagrad // Adagrad
// epsilon: float >= 0. // epsilon: float >= 0.
// decay: float >= 0. Learning rate decay over each update. // decay: float >= 0. Learning rate decay over each update.
// reference : [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) // reference : [Adaptive Subgradient Methods for Online Learning and
optional double epsilon = 41 [default = 1e-5]; // Stochastic
optional double decay = 42 [default = 0.0]; // Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
optional double epsilon = 41 [ default = 1e-5 ];
optional double decay = 42 [ default = 0.0 ];
} }
message AdamConfig { message AdamConfig {
...@@ -46,7 +46,8 @@ message AdamConfig { ...@@ -46,7 +46,8 @@ message AdamConfig {
// beta_2: float, 0 < beta < 1. Generally close to 1. // beta_2: float, 0 < beta < 1. Generally close to 1.
// epsilon: float >= 0. Fuzz factor. // epsilon: float >= 0. Fuzz factor.
// decay: float >= 0. Learning rate decay over each update. // decay: float >= 0. Learning rate decay over each update.
// reference : [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8) // reference : [Adam - A Method for Stochastic
// Optimization](http://arxiv.org/abs/1412.6980v8)
optional double beta_1 = 41; optional double beta_1 = 41;
optional double beta_2 = 42; optional double beta_2 = 42;
optional double epsilon = 43; optional double epsilon = 43;
...@@ -55,32 +56,32 @@ message AdamConfig { ...@@ -55,32 +56,32 @@ message AdamConfig {
message ConstLrConfig { message ConstLrConfig {
// learninRate Policy // learninRate Policy
optional double learning_rate = 1 [default = 1.0]; optional double learning_rate = 1 [ default = 1.0 ];
} }
message LinearLrConfig { message LinearLrConfig {
// learninRate Policy // learninRate Policy
optional double learning_rate = 1 [default = 1.0]; optional double learning_rate = 1 [ default = 1.0 ];
optional double lr_decay_a = 2; optional double lr_decay_a = 2;
optional double lr_decay_b = 3; optional double lr_decay_b = 3;
} }
message TensorProto { message TensorProto {
enum DataType { enum DataType {
PADDLE_ELEMENT_TYPE_INT32 = 0; PADDLE_ELEMENT_TYPE_INT32 = 0;
PADDLE_ELEMENT_TYPE_UINT32 = 1; PADDLE_ELEMENT_TYPE_UINT32 = 1;
PADDLE_ELEMENT_TYPE_INT64 = 2; PADDLE_ELEMENT_TYPE_INT64 = 2;
PADDLE_ELEMENT_TYPE_UINT64 = 3; PADDLE_ELEMENT_TYPE_UINT64 = 3;
PADDLE_ELEMENT_TYPE_FLOAT32 = 4; PADDLE_ELEMENT_TYPE_FLOAT32 = 4;
PADDLE_ELEMENT_TYPE_FLOAT64 = 5; PADDLE_ELEMENT_TYPE_FLOAT64 = 5;
} }
optional DataType data_type = 1; optional DataType data_type = 1;
repeated bytes content = 2; repeated bytes content = 2;
} }
message LrPolicyState { message LrPolicyState {
// learninRate Policy // learninRate Policy
optional double learning_rate = 1 [default = 1.0]; optional double learning_rate = 1 [ default = 1.0 ];
optional double lr_decay_a = 2; optional double lr_decay_a = 2;
optional double lr_decay_b = 3; optional double lr_decay_b = 3;
} }
...@@ -104,7 +105,6 @@ message AdadeltaOptimizerState { ...@@ -104,7 +105,6 @@ message AdadeltaOptimizerState {
optional TensorProto update_delta = 4; optional TensorProto update_delta = 4;
} }
message AdagradOptimizerState { message AdagradOptimizerState {
optional LrPolicyState lr_state = 101; optional LrPolicyState lr_state = 101;
optional double num_sample_passed = 104; optional double num_sample_passed = 104;
...@@ -124,10 +124,10 @@ message AdamOptimizerState { ...@@ -124,10 +124,10 @@ message AdamOptimizerState {
message OptimizerConfig { message OptimizerConfig {
enum Optimizer { enum Optimizer {
SGD = 1; SGD = 1;
Adadelta = 2; Adadelta = 2;
Adagrad = 3; Adagrad = 3;
Adam = 4; Adam = 4;
} }
optional Optimizer optimizer = 1; optional Optimizer optimizer = 1;
optional SGDConfig sgd = 3; optional SGDConfig sgd = 3;
...@@ -136,8 +136,8 @@ message OptimizerConfig { ...@@ -136,8 +136,8 @@ message OptimizerConfig {
optional AdamConfig adam = 6; optional AdamConfig adam = 6;
enum LrPolicy { enum LrPolicy {
Const = 0; Const = 0;
Linear = 1; Linear = 1;
} }
optional LrPolicy lr_policy = 11; optional LrPolicy lr_policy = 11;
optional ConstLrConfig const_lr = 12; optional ConstLrConfig const_lr = 12;
......
...@@ -27,56 +27,57 @@ enum ParameterInitStrategy { ...@@ -27,56 +27,57 @@ enum ParameterInitStrategy {
message ParameterUpdaterHookConfig { message ParameterUpdaterHookConfig {
// hook type such as 'pruning' // hook type such as 'pruning'
required string type = 1; required string type = 1;
// this represents the ratio of zero element to be set by the Parameter // this represents the ratio of zero element to be set by the Parameter
optional double sparsity_ratio = 2 [default = 0.6]; optional double sparsity_ratio = 2 [ default = 0.6 ];
} }
message ParameterConfig { message ParameterConfig {
required string name = 1; required string name = 1;
required uint64 size = 2; required uint64 size = 2;
optional double learning_rate = 3 [default = 1.0]; optional double learning_rate = 3 [ default = 1.0 ];
optional double momentum = 4 [default = 0.0]; optional double momentum = 4 [ default = 0.0 ];
optional double initial_mean = 5 [default = 0.0]; optional double initial_mean = 5 [ default = 0.0 ];
optional double initial_std = 6 [default = 0.01]; optional double initial_std = 6 [ default = 0.01 ];
// use L2-regularization if decay_rate set and decay_rate_l1 not set // use L2-regularization if decay_rate set and decay_rate_l1 not set
optional double decay_rate = 7 [default = 0.0]; optional double decay_rate = 7 [ default = 0.0 ];
// use L1-regularization if decay_rate_l1 set // use L1-regularization if decay_rate_l1 set
optional double decay_rate_l1 = 8 [default = 0.0]; optional double decay_rate_l1 = 8 [ default = 0.0 ];
// dims of Parameter, e.g. dims[0] as height, dims[1] as width.. // dims of Parameter, e.g. dims[0] as height, dims[1] as width..
repeated uint64 dims = 9; repeated uint64 dims = 9;
// the gpu device which the parameter in. // the gpu device which the parameter in.
// Only used by ParallelNeuralNetork. Ignored otherwise. // Only used by ParallelNeuralNetork. Ignored otherwise.
optional int32 device = 10 [default = -1]; optional int32 device = 10 [ default = -1 ];
// how to init the parameter: 0 -> normal, 1 -> uniform // how to init the parameter: 0 -> normal, 1 -> uniform
// 0: treat initial_mean as mean, intial_std as standard deviation // 0: treat initial_mean as mean, intial_std as standard deviation
// 1: range is (initial_mean - initial_std) to (initial_mean + initial_std) // 1: range is (initial_mean - initial_std) to (initial_mean + initial_std)
optional int32 initial_strategy = 11 [default = 0]; optional int32 initial_strategy = 11 [ default = 0 ];
// define the variance when init the parameter, by height of the Matrix // define the variance when init the parameter, by height of the Matrix
optional bool initial_smart = 12 [default = false]; optional bool initial_smart = 12 [ default = false ];
// apply regularization every # batches // apply regularization every # batches
optional int32 num_batches_regularization = 13 [default = 1]; optional int32 num_batches_regularization = 13 [ default = 1 ];
// if is_sparse is true, para is sparse, else para is dense // if is_sparse is true, para is sparse, else para is dense
optional bool is_sparse = 14[default = false]; optional bool is_sparse = 14 [ default = false ];
// if para is sparse, format should be "csc" or "csr", empty means is not sparse // if para is sparse, format should be "csc" or "csr", empty means is not
optional string format = 15 [default = ""]; // sparse
optional string format = 15 [ default = "" ];
// sparse remote update or not // sparse remote update or not
optional bool sparse_remote_update = 16 [default = false]; optional bool sparse_remote_update = 16 [ default = false ];
// gradient clipping threshold, no clipping by default // gradient clipping threshold, no clipping by default
optional double gradient_clipping_threshold = 17 [default = 0.0]; optional double gradient_clipping_threshold = 17 [ default = 0.0 ];
// static parameters are fixed when training // static parameters are fixed when training
optional bool is_static = 18 [default = false]; optional bool is_static = 18 [ default = false ];
// para_id should NOT be set by config_parser. It is for // para_id should NOT be set by config_parser. It is for
// internal use. // internal use.
optional uint64 para_id = 19; optional uint64 para_id = 19;
repeated ParameterUpdaterHookConfig update_hooks = 20; repeated ParameterUpdaterHookConfig update_hooks = 20;
// setup load mat -> csr // setup load mat -> csr
optional bool need_compact = 21 [default = false]; optional bool need_compact = 21 [ default = false ];
// whether to do sparse update for this parameter // whether to do sparse update for this parameter
optional bool sparse_update = 22 [default = false]; optional bool sparse_update = 22 [ default = false ];
// whether this parameter is shared or not. // whether this parameter is shared or not.
optional bool is_shared = 23 [default = false]; optional bool is_shared = 23 [ default = false ];
// parameter block size // parameter block size
optional uint64 parameter_block_size = 24 [default = 0]; optional uint64 parameter_block_size = 24 [ default = 0 ];
} }
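The initial_strategy comment in ParameterConfig is terse. The following self-contained numpy sketch (illustrative only, not Paddle code; the helper name and values are made up) shows what the two strategies amount to:

    import numpy as np

    # Illustrative re-statement of the ParameterConfig comment:
    #   initial_strategy == 0 -> normal(initial_mean, initial_std)
    #   initial_strategy == 1 -> uniform(initial_mean - initial_std,
    #                                    initial_mean + initial_std)
    def init_parameter(shape, initial_strategy=0, initial_mean=0.0, initial_std=0.01):
        rng = np.random.RandomState(0)
        if initial_strategy == 0:
            return rng.normal(loc=initial_mean, scale=initial_std, size=shape)
        if initial_strategy == 1:
            return rng.uniform(low=initial_mean - initial_std,
                               high=initial_mean + initial_std, size=shape)
        raise ValueError("unknown initial_strategy: %d" % initial_strategy)

    w = init_parameter((256, 128), initial_strategy=1, initial_std=0.05)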
...@@ -15,13 +15,10 @@ syntax = "proto2"; ...@@ -15,13 +15,10 @@ syntax = "proto2";
package paddle; package paddle;
/** /**
* Configuration structure for ParameterClient2. * Configuration structure for ParameterClient2.
*/ */
message ParameterClientConfig { message ParameterClientConfig { required int32 trainer_id = 1; }
required int32 trainer_id = 1;
}
/** /**
* Configuration structure for ParameterServer2. * Configuration structure for ParameterServer2.
...@@ -30,24 +27,24 @@ message ParameterServerConfig { ...@@ -30,24 +27,24 @@ message ParameterServerConfig {
// Number of ports for sending dense parameter, // Number of ports for sending dense parameter,
// following ports on parameter server will be visited // following ports on parameter server will be visited
// for sending dense parameter: [port, port+ports_num-1] // for sending dense parameter: [port, port+ports_num-1]
required int32 ports_num = 1 [default = 1]; required int32 ports_num = 1 [ default = 1 ];
// Number of ports for sending sparse parameter, // Number of ports for sending sparse parameter,
// following ports on parameter server will be visited // following ports on parameter server will be visited
// for sending sparse parameter: // for sending sparse parameter:
// [port+ports_num, port+ports_num+ports_num_for_sparse-1] // [port+ports_num, port+ports_num+ports_num_for_sparse-1]
required int32 ports_num_for_sparse = 2 [default = 0]; required int32 ports_num_for_sparse = 2 [ default = 0 ];
// network device name for pservers // network device name for pservers
required string nics = 3 [default = "xgbe0,xgbe1"]; required string nics = 3 [ default = "xgbe0,xgbe1" ];
required string rdma_tcp = 4 [default = "tcp"]; required string rdma_tcp = 4 [ default = "tcp" ];
// Listening port for pserver // Listening port for pserver
required int32 port = 5 [default = 20134]; required int32 port = 5 [ default = 20134 ];
// number of gradient servers // number of gradient servers
required int32 num_gradient_servers = 6 [default = 1]; required int32 num_gradient_servers = 6 [ default = 1 ];
// number of threads for sync op exec // number of threads for sync op exec
required int32 pserver_num_threads = 7 [default = 1]; required int32 pserver_num_threads = 7 [ default = 1 ];
// control config_.async_lagged_grad_discard_ratio() min value // control config_.async_lagged_grad_discard_ratio() min value
required double async_lagged_ratio_min = 8 [default = 1.0]; required double async_lagged_ratio_min = 8 [ default = 1.0 ];
// if async_lagged_grad_discard_ratio is not set in trainer_config.conf // if async_lagged_grad_discard_ratio is not set in trainer_config.conf
// use it as default value // use it as default value
required double async_lagged_ratio_default = 9 [default = 1.5]; required double async_lagged_ratio_default = 9 [ default = 1.5 ];
} }
\ No newline at end of file
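The ports_num / ports_num_for_sparse comments describe two contiguous port ranges. A small self-contained helper (hypothetical, written here only to spell out the arithmetic) makes the layout explicit:

    def pserver_port_ranges(port, ports_num, ports_num_for_sparse):
        """Port ranges described in ParameterServerConfig.

        Dense parameters use [port, port + ports_num - 1]; sparse parameters
        use [port + ports_num, port + ports_num + ports_num_for_sparse - 1].
        """
        dense = list(range(port, port + ports_num))
        sparse = list(range(port + ports_num,
                            port + ports_num + ports_num_for_sparse))
        return dense, sparse

    # With the defaults port=20134, ports_num=1, ports_num_for_sparse=0:
    print(pserver_port_ranges(20134, 1, 0))   # ([20134], [])
    print(pserver_port_ranges(20134, 2, 2))   # ([20134, 20135], [20136, 20137])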
...@@ -23,8 +23,8 @@ package paddle; ...@@ -23,8 +23,8 @@ package paddle;
*/ */
enum ParameterUpdateMode { enum ParameterUpdateMode {
// Set parameter // Set parameter
PSERVER_UPDATE_MODE_SET_PARAM = 0;//use local param PSERVER_UPDATE_MODE_SET_PARAM = 0; // use local param
PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1;//set zero param PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1; // set zero param
// Update parameter once a gradient is received // Update parameter once a gradient is received
PSERVER_UPDATE_MODE_ASYNC_SGD = 2; PSERVER_UPDATE_MODE_ASYNC_SGD = 2;
...@@ -37,7 +37,7 @@ enum ParameterUpdateMode { ...@@ -37,7 +37,7 @@ enum ParameterUpdateMode {
// No update. Only get parameters back. // No update. Only get parameters back.
PSERVER_UPDATE_MODE_GET_PARAM = 5; PSERVER_UPDATE_MODE_GET_PARAM = 5;
PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6;//only get sparse rows PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6; // only get sparse rows
}; };
message ParameterBlock { message ParameterBlock {
...@@ -80,42 +80,34 @@ message SendParameterRequest { ...@@ -80,42 +80,34 @@ message SendParameterRequest {
optional int32 trainer_id = 7; optional int32 trainer_id = 7;
// send back parameter type on pserver, PARAMETER_VALUE by default // send back parameter type on pserver, PARAMETER_VALUE by default
optional int32 send_back_parameter_type = 8 [default = 0]; optional int32 send_back_parameter_type = 8 [ default = 0 ];
// forwardbackward time in usec // forwardbackward time in usec
optional uint64 forwardbackward_time = 9; optional uint64 forwardbackward_time = 9;
} }
message WaitPassStartRequest { message WaitPassStartRequest {}
}
message WaitPassStartResponse { message WaitPassStartResponse {}
}
message WaitPassFinishRequest { message WaitPassFinishRequest {}
}
message WaitPassFinishResponse { message WaitPassFinishResponse {}
}
enum SyncObject { enum SyncObject {
SYNC_DEFAULT = 0; // wait for the synchronizeBarrier_ SYNC_DEFAULT = 0; // wait for the synchronizeBarrier_
SYNC_DATA = 1; // wait for the synchronizeDataBarrier_ SYNC_DATA = 1; // wait for the synchronizeDataBarrier_
} }
message SynchronizeRequest { message SynchronizeRequest {
required SyncObject sync_object_id = 1 [default = SYNC_DEFAULT]; required SyncObject sync_object_id = 1 [ default = SYNC_DEFAULT ];
optional int32 trainer_id = 2; optional int32 trainer_id = 2;
} }
message SynchronizeResponse { message SynchronizeResponse {}
}
message SendParameterResponse { message SendParameterResponse { repeated ParameterBlock blocks = 1; }
repeated ParameterBlock blocks = 1;
}
message SetConfigRequest { message SetConfigRequest {
repeated ParameterConfig param_configs = 1; repeated ParameterConfig param_configs = 1;
...@@ -125,26 +117,18 @@ message SetConfigRequest { ...@@ -125,26 +117,18 @@ message SetConfigRequest {
required bool is_sparse_server = 6; required bool is_sparse_server = 6;
} }
message SetConfigResponse{ message SetConfigResponse {}
}
message GetStatusRequest { message GetStatusRequest {}
}
message GetStatusResponse { message GetStatusResponse { required PServerStatus status = 1; }
required PServerStatus status = 1;
}
message SetStatusRequest { message SetStatusRequest { required PServerStatus status = 1; }
required PServerStatus status = 1;
}
message SetStatusResponse { message SetStatusResponse {}
}
// create a column vector. The size is the dimension of parameter // create a column vector. The size is the dimension of parameter
message CreateVectorRequest { message CreateVectorRequest {}
}
message CreateVectorResponse { message CreateVectorResponse {
// error message. Empty if success // error message. Empty if success
...@@ -153,9 +137,7 @@ message CreateVectorResponse { ...@@ -153,9 +137,7 @@ message CreateVectorResponse {
required int64 handle = 2; required int64 handle = 2;
} }
message ReleaseVectorRequest { message ReleaseVectorRequest { required int64 handle = 1; }
required int64 handle = 1;
}
message ReleaseVectorResponse { message ReleaseVectorResponse {
// error message. Empty if success // error message. Empty if success
...@@ -164,9 +146,7 @@ message ReleaseVectorResponse { ...@@ -164,9 +146,7 @@ message ReleaseVectorResponse {
// Create a column major matrix. The number of rows is the dimension // Create a column major matrix. The number of rows is the dimension
// of parameter. The number of columns is specified by num_cols // of parameter. The number of columns is specified by num_cols
message CreateMatrixRequest { message CreateMatrixRequest { required int32 num_cols = 1; }
required int32 num_cols = 1;
}
message CreateMatrixResponse { message CreateMatrixResponse {
// error message. Empty if success // error message. Empty if success
...@@ -175,16 +155,13 @@ message CreateMatrixResponse { ...@@ -175,16 +155,13 @@ message CreateMatrixResponse {
required int64 handle = 2; required int64 handle = 2;
} }
message ReleaseMatrixRequest { message ReleaseMatrixRequest { required int64 handle = 1; }
required int64 handle = 1;
}
message ReleaseMatrixResponse { message ReleaseMatrixResponse {
// error message. Empty if success // error message. Empty if success
optional string return_message = 1; optional string return_message = 1;
} }
/** /**
* The operations are defined using the variables commented at Operation * The operations are defined using the variables commented at Operation
* and OperationResult * and OperationResult
...@@ -245,36 +222,36 @@ enum MatrixVectorOperation { ...@@ -245,36 +222,36 @@ enum MatrixVectorOperation {
message ProtoVector { message ProtoVector {
required int64 dim = 1; required int64 dim = 1;
repeated double values = 2 [packed = true]; repeated double values = 2 [ packed = true ];
} }
message ProtoMatrix { message ProtoMatrix {
required int64 num_rows = 1; required int64 num_rows = 1;
required int64 num_cols = 2; required int64 num_cols = 2;
repeated double values = 3 [packed = true]; repeated double values = 3 [ packed = true ];
} }
message Operation { message Operation {
required MatrixVectorOperation operation = 1; required MatrixVectorOperation operation = 1;
// vector handles created on the pserver // vector handles created on the pserver
repeated int64 pvectors = 2; // u, v, w repeated int64 pvectors = 2; // u, v, w
// matrix handles created on the pserver // matrix handles created on the pserver
repeated int64 pmatrices = 3; // A, B, C repeated int64 pmatrices = 3; // A, B, C
repeated double scalars = 4; // a, b, c repeated double scalars = 4; // a, b, c
repeated ProtoVector vectors = 5; // x, y, z repeated ProtoVector vectors = 5; // x, y, z
repeated ProtoMatrix matrices = 6; // X, Y, Z repeated ProtoMatrix matrices = 6; // X, Y, Z
} }
message OperationResult { message OperationResult {
// error message. Empty if success // error message. Empty if success
optional string return_message = 1; optional string return_message = 1;
// //
repeated double scalars = 2; // d, e, f repeated double scalars = 2; // d, e, f
repeated ProtoVector vectors = 3; // p, q, r repeated ProtoVector vectors = 3; // p, q, r
repeated ProtoMatrix matrices = 4; // P, Q, R repeated ProtoMatrix matrices = 4; // P, Q, R
} }
message DoOperationRequest { message DoOperationRequest {
...@@ -301,18 +278,14 @@ message DoOperationResponse { ...@@ -301,18 +278,14 @@ message DoOperationResponse {
required bool pass_finish = 3; required bool pass_finish = 3;
} }
message LoadValueRequest { message LoadValueRequest { required string dir_name = 1; }
required string dir_name = 1;
}
message LoadValueResponse { message LoadValueResponse {
// error message. Empty if success // error message. Empty if success
optional string return_message = 1; optional string return_message = 1;
} }
message SaveValueRequest { message SaveValueRequest { required string dir_name = 1; }
required string dir_name = 1;
}
message SaveValueResponse { message SaveValueResponse {
// error message. Empty if success // error message. Empty if success
...@@ -331,11 +304,11 @@ enum DataUpdateMode { ...@@ -331,11 +304,11 @@ enum DataUpdateMode {
// Client send it's own ref label to pserver // Client send it's own ref label to pserver
DATA_UPDATE_MODE_SET_REF_LABEL = 4; DATA_UPDATE_MODE_SET_REF_LABEL = 4;
// Client get all ref labels from all pservers // Client get all ref labels from all pservers
DATA_UPDATE_MODE_GET_REF_LABEL =5; DATA_UPDATE_MODE_GET_REF_LABEL = 5;
// Client send it's own ref grad to pserver // Client send it's own ref grad to pserver
DATA_UPDATE_MODE_SET_REF_GRAD =6; DATA_UPDATE_MODE_SET_REF_GRAD = 6;
// Client get all ref grad from all pservers // Client get all ref grad from all pservers
DATA_UPDATE_MODE_GET_REF_GRAD =7; DATA_UPDATE_MODE_GET_REF_GRAD = 7;
} }
enum SendDataType { enum SendDataType {
...@@ -360,7 +333,7 @@ message DataBlock { ...@@ -360,7 +333,7 @@ message DataBlock {
// byte size of one data type // byte size of one data type
required int32 data_size = 2; required int32 data_size = 2;
// data_type // data_type
optional TransDataType data_type = 3 [default = TRANS_DOUBLE]; optional TransDataType data_type = 3 [ default = TRANS_DOUBLE ];
} }
message SendDataRequest { message SendDataRequest {
......
...@@ -20,14 +20,14 @@ package paddle; ...@@ -20,14 +20,14 @@ package paddle;
message OptimizationConfig { message OptimizationConfig {
required int32 batch_size = 3; required int32 batch_size = 3;
required string algorithm = 4 [default = "async_sgd"]; required string algorithm = 4 [ default = "async_sgd" ];
optional int32 num_batches_per_send_parameter = 5 [default = 1]; optional int32 num_batches_per_send_parameter = 5 [ default = 1 ];
optional int32 num_batches_per_get_parameter = 6 [default = 1]; optional int32 num_batches_per_get_parameter = 6 [ default = 1 ];
required double learning_rate = 7; required double learning_rate = 7;
optional double learning_rate_decay_a = 8 [default = 0]; optional double learning_rate_decay_a = 8 [ default = 0 ];
optional double learning_rate_decay_b = 9 [default = 0]; optional double learning_rate_decay_b = 9 [ default = 0 ];
optional string learning_rate_schedule = 27 [default = "constant"]; optional string learning_rate_schedule = 27 [ default = "constant" ];
// learning rate will be scaled according to learning_rate_schedule // learning rate will be scaled according to learning_rate_schedule
// 1), constant: // 1), constant:
// lr = learning_rate // lr = learning_rate
...@@ -49,88 +49,92 @@ message OptimizationConfig { ...@@ -49,88 +49,92 @@ message OptimizationConfig {
// owlqn related // owlqn related
// L1-regularization // L1-regularization
optional double l1weight = 10 [default = 0.1]; optional double l1weight = 10 [ default = 0.1 ];
// L2-regularization // L2-regularization
optional double l2weight = 11 [default = 0]; optional double l2weight = 11 [ default = 0 ];
// "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step) // "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step)
// then accept the step // then accept the step
optional double c1 = 12 [default = 0.0001]; optional double c1 = 12 [ default = 0.0001 ];
// multiply the step with "backoff", when wolfe condition doesn't satisfy // multiply the step with "backoff", when wolfe condition doesn't satisfy
optional double backoff = 13 [default = 0.5]; optional double backoff = 13 [ default = 0.5 ];
// how many "s"s and "y"s are kept in owlqn // how many "s"s and "y"s are kept in owlqn
optional int32 owlqn_steps = 14 [default = 10]; optional int32 owlqn_steps = 14 [ default = 10 ];
// accept the step if encountered "max_backoff" times of "reduce the step" // accept the step if encountered "max_backoff" times of "reduce the step"
optional int32 max_backoff = 15 [default = 5]; optional int32 max_backoff = 15 [ default = 5 ];
// L2-regularization coefficient is reduced linearly from iteration 0 to // L2-regularization coefficient is reduced linearly from iteration 0 to
// "l2weight_zero_iter", and set to 0 after "l2weight_zero_iter" // "l2weight_zero_iter", and set to 0 after "l2weight_zero_iter"
// iterations. set "l2weight_zero_iter" to 0 to disable this strategy. // iterations. set "l2weight_zero_iter" to 0 to disable this strategy.
optional int32 l2weight_zero_iter = 17 [default = 0]; optional int32 l2weight_zero_iter = 17 [ default = 0 ];
// averaged sgd // averaged sgd
// About average_window * numBatchProcessed parameter are used // About average_window * numBatchProcessed parameter are used
// for average. To be accurate, between average_window * numBatchProcessed // for average. To be accurate, between average_window * numBatchProcessed
// and 2 * average_window * numBatchProcessed parameters are used for // and 2 * average_window * numBatchProcessed parameters are used for
// average. // average.
optional double average_window = 18 [default = 0]; optional double average_window = 18 [ default = 0 ];
optional int64 max_average_window = 19 [default = 0x7fffffffffffffff]; optional int64 max_average_window = 19 [ default = 0x7fffffffffffffff ];
////////////////////////// //////////////////////////
// Options Adaptive SGD // // Options Adaptive SGD //
////////////////////////// //////////////////////////
// learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta", "rmsprop" // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta",
// default learning method("momentum") use global decayed learning rate with momentum. // "rmsprop"
// default learning method("momentum") use global decayed learning rate with
// momentum.
// "adagrad", "adadelta" and "rmsprop" can set momentum too. // "adagrad", "adadelta" and "rmsprop" can set momentum too.
optional string learning_method = 23 [default = "momentum"]; optional string learning_method = 23 [ default = "momentum" ];
optional double ada_epsilon = 24 [default = 1e-6]; optional double ada_epsilon = 24 [ default = 1e-6 ];
optional double ada_rou = 26 [default = 0.95]; optional double ada_rou = 26 [ default = 0.95 ];
// Force to do average in cpu in order to save gpu memory usage // Force to do average in cpu in order to save gpu memory usage
optional bool do_average_in_cpu = 25 [default = false]; optional bool do_average_in_cpu = 25 [ default = false ];
// delta add rate in pserver, used while num_batches_per_send_parameter>1 // delta add rate in pserver, used while num_batches_per_send_parameter>1
// will be divided by #machines automatically. // will be divided by #machines automatically.
optional double delta_add_rate = 28 [default = 1.0]; optional double delta_add_rate = 28 [ default = 1.0 ];
// We split a large size into smaller mini-batches, whose sizes are // We split a large size into smaller mini-batches, whose sizes are
// determined by mini_batch_size. It only takes effect when there is // determined by mini_batch_size. It only takes effect when there is
// an ExternalMachine. // an ExternalMachine.
optional int32 mini_batch_size = 29 [default = 128]; optional int32 mini_batch_size = 29 [ default = 128 ];
// automatically set if any one of parameters set sparse remote update flag // automatically set if any one of parameters set sparse remote update flag
optional bool use_sparse_remote_updater = 30 [default = false]; optional bool use_sparse_remote_updater = 30 [ default = false ];
// how to update center parameter and feedback to local parameter, // how to update center parameter and feedback to local parameter,
// when use local sgd update in cluster training. // when use local sgd update in cluster training.
// A option is elastic_average, proposed by the paper: Deep learning with elastic averaging SGD. // A option is elastic_average, proposed by the paper: Deep learning with
// If use elastic_average method, every trainer node should sample from whole data sets. // elastic averaging SGD.
optional string center_parameter_update_method = 31 [default = "average"]; // If use elastic_average method, every trainer node should sample from whole
// data sets.
optional string center_parameter_update_method = 31 [ default = "average" ];
// shrink sparse parameter value // shrink sparse parameter value
// only works if parameter is remote sparse update and has L1 decay rate // only works if parameter is remote sparse update and has L1 decay rate
optional double shrink_parameter_value = 32 [default = 0]; optional double shrink_parameter_value = 32 [ default = 0 ];
//////////////////////////// ////////////////////////////
// Options Adam Optimizer // // Options Adam Optimizer //
//////////////////////////// ////////////////////////////
optional double adam_beta1 = 33 [default = 0.9]; optional double adam_beta1 = 33 [ default = 0.9 ];
optional double adam_beta2 = 34 [default = 0.999]; optional double adam_beta2 = 34 [ default = 0.999 ];
optional double adam_epsilon = 35 [default = 1e-8]; optional double adam_epsilon = 35 [ default = 1e-8 ];
// arguments for learning rate scheduler // arguments for learning rate scheduler
// Format: num1:rate1,num2:rate2,...,numK:rateK // Format: num1:rate1,num2:rate2,...,numK:rateK
// For learning_rate_schedule="manual", num is the number of samples, // For learning_rate_schedule="manual", num is the number of samples,
// For learning_rate_schedule="pass_manual", // For learning_rate_schedule="pass_manual",
// num is the number of passes (starting from 0) // num is the number of passes (starting from 0)
optional string learning_rate_args = 36 [default = ""]; optional string learning_rate_args = 36 [ default = "" ];
// for async sgd gradient commit control. // for async sgd gradient commit control.
// when async_lagged_grad_discard_ratio * num_gradient_servers commit passed, // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed,
// current async gradient will be discarded silently. // current async gradient will be discarded silently.
optional double async_lagged_grad_discard_ratio = 37 [default = 1.5]; optional double async_lagged_grad_discard_ratio = 37 [ default = 1.5 ];
// global threshold for gradient clipping // global threshold for gradient clipping
optional double gradient_clipping_threshold = 38 [default = 0.0]; optional double gradient_clipping_threshold = 38 [ default = 0.0 ];
}; };
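The async_lagged_grad_discard_ratio comment compresses a rule about stale gradients. The sketch below (a hypothetical helper, not the pserver implementation) spells out the condition it describes, using the default ratio of 1.5:

    def should_discard_async_grad(commits_passed, num_gradient_servers,
                                  async_lagged_grad_discard_ratio=1.5):
        # Per the OptimizationConfig comment: once more than
        # ratio * num_gradient_servers commits have gone by since a gradient
        # was computed, it is considered too stale and dropped silently.
        return commits_passed > async_lagged_grad_discard_ratio * num_gradient_servers

    # Example: with 4 gradient servers and the default ratio of 1.5,
    # a gradient lagging behind by 7 commits (> 6) would be dropped.
    print(should_discard_async_grad(7, 4))  # True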
message TrainerConfig { message TrainerConfig {
...@@ -141,7 +145,7 @@ message TrainerConfig { ...@@ -141,7 +145,7 @@ message TrainerConfig {
repeated string config_files = 5; repeated string config_files = 5;
// the directory to save/load model files for each training path // the directory to save/load model files for each training path
optional string save_dir = 6 [default = "./output/model"]; optional string save_dir = 6 [ default = "./output/model" ];
// Path of the initial model parameters. // Path of the initial model parameters.
// If it was set, start_pass will be ignored. // If it was set, start_pass will be ignored.
...@@ -149,7 +153,7 @@ message TrainerConfig { ...@@ -149,7 +153,7 @@ message TrainerConfig {
// Start training from this pass. // Start training from this pass.
// Will load parameter from the previous pass. // Will load parameter from the previous pass.
optional int32 start_pass = 8 [default = 0]; optional int32 start_pass = 8 [ default = 0 ];
// file path to the trainer config file // file path to the trainer config file
optional string config_file = 9; optional string config_file = 9;
......
...@@ -39,7 +39,7 @@ add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp ...@@ -39,7 +39,7 @@ add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
add_custom_target(paddle_python ALL DEPENDS add_custom_target(paddle_python ALL DEPENDS
${OUTPUT_DIR}/.timestamp) ${OUTPUT_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model python_api_wheel)
set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
......
...@@ -133,7 +133,7 @@ def convert(path): ...@@ -133,7 +133,7 @@ def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, train100(), 10, "cifar_train100") paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100")
paddle.v2.dataset.common.convert(path, test100(), 10, "cifar_test100") paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100")
paddle.v2.dataset.common.convert(path, train10(), 10, "cifar_train10") paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10")
paddle.v2.dataset.common.convert(path, test10(), 10, "cifar_test10") paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10")
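Across this diff the third argument to paddle.v2.dataset.common.convert is raised from 10 to 1000; judging from split(reader, line_count, ...) later in common.py, it controls how many records go into each output file. A minimal usage sketch, assuming the paddle.v2.dataset package is installed, mirrors the cifar call above:

    import paddle.v2.dataset.cifar as cifar
    import paddle.v2.dataset.common as common

    # Write the CIFAR-100 training set under ./recordio, 1000 records per
    # output file, with file names prefixed "cifar_train100" (mirrors the
    # call in the diff above).
    common.convert("./recordio", cifar.train100(), 1000, "cifar_train100")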
...@@ -32,17 +32,22 @@ __all__ = [ ...@@ -32,17 +32,22 @@ __all__ = [
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset') DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
# When running unit tests, there could be multiple processes that # When running unit tests, there could be multiple processes that
# trying to create DATA_HOME directory simultaneously, so we cannot # trying to create DATA_HOME directory simultaneously, so we cannot
# use a if condition to check for the existence of the directory; # use a if condition to check for the existence of the directory;
# instead, we use the filesystem as the synchronization mechanism by # instead, we use the filesystem as the synchronization mechanism by
# catching returned errors. # catching returned errors.
try: def must_mkdirs(path):
os.makedirs(DATA_HOME) try:
except OSError as exc: os.makedirs(DATA_HOME)
if exc.errno != errno.EEXIST: except OSError as exc:
raise if exc.errno != errno.EEXIST:
pass raise
pass
must_mkdirs(DATA_HOME)
def md5file(fname): def md5file(fname):
...@@ -93,6 +98,19 @@ def fetch_all(): ...@@ -93,6 +98,19 @@ def fetch_all():
"fetch")() "fetch")()
def fetch_all_recordio(path):
for module_name in filter(lambda x: not x.startswith("__"),
dir(paddle.v2.dataset)):
if "convert" in dir(
importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \
not module_name == "common":
ds_path = os.path.join(path, module_name)
must_mkdirs(ds_path)
getattr(
importlib.import_module("paddle.v2.dataset.%s" % module_name),
"convert")(ds_path)
def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
""" """
you can call the function as: you can call the function as:
......
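The comment above this hunk explains why an existence check on DATA_HOME is not enough when multiple test processes start at once: the directory creation itself is the synchronization point, and EEXIST is swallowed. The same pattern, detached from Paddle, looks like this (on Python 3, os.makedirs(path, exist_ok=True) is an equivalent shortcut):

    import errno
    import os

    def makedirs_if_missing(path):
        # Trying to create the directory and tolerating EEXIST avoids the
        # check-then-create race: whichever process loses the race simply
        # sees EEXIST and carries on.
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise

    makedirs_if_missing("/tmp/paddle_dataset_cache_demo")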
...@@ -233,5 +233,5 @@ def convert(path): ...@@ -233,5 +233,5 @@ def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, test(), 10, "conl105_train") paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train")
paddle.v2.dataset.common.convert(path, test(), 10, "conl105_test") paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test")
...@@ -173,5 +173,5 @@ def convert(path): ...@@ -173,5 +173,5 @@ def convert(path):
Converts dataset to recordio format Converts dataset to recordio format
""" """
w = word_dict() w = word_dict()
paddle.v2.dataset.common.convert(path, lambda: train(w), 10, "imdb_train") paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
paddle.v2.dataset.common.convert(path, lambda: test(w), 10, "imdb_test") paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
...@@ -155,6 +155,7 @@ def convert(path): ...@@ -155,6 +155,7 @@ def convert(path):
N = 5 N = 5
word_dict = build_dict() word_dict = build_dict()
paddle.v2.dataset.common.convert(path, paddle.v2.dataset.common.convert(path,
train(word_dict, N), 10, "imikolov_train") train(word_dict, N), 1000,
"imikolov_train")
paddle.v2.dataset.common.convert(path, paddle.v2.dataset.common.convert(path,
test(word_dict, N), 10, "imikolov_test") test(word_dict, N), 1000, "imikolov_test")
...@@ -119,5 +119,5 @@ def convert(path): ...@@ -119,5 +119,5 @@ def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, train(), 10, "minist_train") paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train")
paddle.v2.dataset.common.convert(path, test(), 10, "minist_test") paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test")
...@@ -254,8 +254,8 @@ def convert(path): ...@@ -254,8 +254,8 @@ def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, train(), 10, "movielens_train") paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train")
paddle.v2.dataset.common.convert(path, test(), 10, "movielens_test") paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test")
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -137,5 +137,5 @@ def convert(path): ...@@ -137,5 +137,5 @@ def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, train, 10, "sentiment_train") paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train")
paddle.v2.dataset.common.convert(path, test, 10, "sentiment_test") paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test")
...@@ -119,5 +119,5 @@ def convert(path): ...@@ -119,5 +119,5 @@ def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, train(), 10, "uci_housing_train") paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train")
paddle.v2.dataset.common.convert(path, test(), 10, "uci_houseing_test") paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test")
...@@ -169,5 +169,6 @@ def convert(path): ...@@ -169,5 +169,6 @@ def convert(path):
Converts dataset to recordio format Converts dataset to recordio format
""" """
dict_size = 30000 dict_size = 30000
paddle.v2.dataset.common.convert(path, train(dict_size), 10, "wmt14_train") paddle.v2.dataset.common.convert(path,
paddle.v2.dataset.common.convert(path, test(dict_size), 10, "wmt14_test") train(dict_size), 1000, "wmt14_train")
paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
import paddle.v2.framework.core as core
from paddle.v2.framework.create_op_creation_methods import op_creations
from default_scope_funcs import new_var, find_var, get_cur_scope
__all__ = ['Network'] # Only expose Network
class NetworkFunctor(object):
"""
Network Op Creation Function. Used internally in this module.
It convert string input to Variable. If it is not created before, just
create in scope.
It is a functor object. means the instances are callable.
:param func: The op creation function which generated in Python.
:param net: The Network instance.
"""
def __init__(self, func, net):
self.func = func
self.net = net
def __call__(self, *args, **kwargs):
if len(args) != 0:
raise ValueError("Paddle must use keyword argument")
inputs = self.func.all_input_args
for ipt in inputs:
if ipt in kwargs:
var = kwargs[ipt]
if isinstance(var, basestring):
tmp = new_var(var)
self.net.var_names[tmp] = var
var = tmp
if not isinstance(var, core.Variable):
raise TypeError(
"Input of op creation must be string or variable")
kwargs[ipt] = self.net.var_names[var]
notemp_outputs = self.func.all_not_temp_output_args
for name in notemp_outputs:
if name not in kwargs:
kwargs[
name] = self.func.__name__ + "@OUT@%d" % core.unique_integer(
)
outputs = self.func.all_output_args
for opt in outputs:
if opt in kwargs:
var = kwargs[opt]
if isinstance(var, basestring):
tmp = new_var(var)
self.net.var_names[tmp] = var
var = tmp
if not isinstance(var, core.Variable):
raise TypeError(
"Output of op creation must be string or variable")
kwargs[opt] = self.net.var_names[var]
op = self.func(**kwargs)
self.net.net.add_op(op)
lst = [find_var(kwargs[opt]) for opt in notemp_outputs]
if len(lst) == 1:
return lst[0]
elif len(lst) == 0:
return None
else:
return lst
class Network(object):
"""
The network concept. It avoid user to manually create operator, create
variable, and combine them into a Net. Just use Network.xxx can create the
operator, create variables in default scope, and add them into `self.net`.
For example:
.. code-block: python
net = Network()
out = net.add_two(X="a", Y="b")
fc_out = net.fc(X="out", W="fc.w")
net.run(...)
"""
def __init__(self):
self.net = core.Net.create()
funcs = (func_name for func_name in dir(op_creations)
if not func_name.startswith("__"))
self.var_names = dict()
# TODO(yuyang18): This code can work, but do not generate a good
# docstring, try to give a better way generate function in runtime
# later.
for func_name in funcs:
func = getattr(op_creations, func_name)
impl = NetworkFunctor(func, self)
setattr(self, func_name, impl.__call__)
self.__complete_add_op__ = False
def infer_shape(self):
self.complete_add_op()
self.net.infer_shape(get_cur_scope())
def run(self, device_context):
self.complete_add_op()
self.net.run(get_cur_scope(), device_context)
def __str__(self):
return str(self.net)
def complete_add_op(self):
if not self.__complete_add_op__:
self.net.complete_add_op()
self.__complete_add_op__ = True
if __name__ == '__main__':
net = Network()
out = net.add_two(X="a", Y="b")
fc_out = net.fc(X=out, W="fc.w", b="fc.b", activation="softmax")
net.complete_add_op()
print net
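network.py above is built around a functor: NetworkFunctor objects are callable and rewrite string keyword arguments into scope variables before delegating to the wrapped op-creation function. Stripped of the Paddle-specific parts, the pattern reduces to something like the following (names here are made up for illustration):

    class KwargRewriter(object):
        """Callable wrapper that normalizes keyword arguments, then delegates."""

        def __init__(self, func, prefix):
            self.func = func
            self.prefix = prefix

        def __call__(self, **kwargs):
            normalized = dict((k, self.prefix + str(v)) for k, v in kwargs.items())
            return self.func(**normalized)

    def make_op(**kwargs):
        return sorted(kwargs.items())

    add_two = KwargRewriter(make_op, prefix="var:")
    print(add_two(X="a", Y="b"))  # [('X', 'var:a'), ('Y', 'var:b')]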
import paddle.v2.framework.core as core import paddle.v2.framework.core as core
import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2 import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2 import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2 import paddle.v2.framework.proto.attribute_pb2 as attribute_pb2
import cStringIO
def get_all_op_protos(): def get_all_op_protos():
...@@ -57,7 +56,7 @@ class OpDescCreationMethod(object): ...@@ -57,7 +56,7 @@ class OpDescCreationMethod(object):
op_desc.attrs.extend([out_format]) op_desc.attrs.extend([out_format])
if len(tmp_index) != 0: if len(tmp_index) != 0:
tmp_index_attr = op_desc.attrs.add() tmp_index_attr = op_desc.attrs.add()
tmp_index_attr.type = attr_type_pb2.INTS tmp_index_attr.type = attribute_pb2.INTS
tmp_index_attr.name = "temporary_index" tmp_index_attr.name = "temporary_index"
tmp_index_attr.ints.extend(tmp_index) tmp_index_attr.ints.extend(tmp_index)
...@@ -73,17 +72,17 @@ class OpDescCreationMethod(object): ...@@ -73,17 +72,17 @@ class OpDescCreationMethod(object):
new_attr = op_desc.attrs.add() new_attr = op_desc.attrs.add()
new_attr.name = attr.name new_attr.name = attr.name
new_attr.type = attr.type new_attr.type = attr.type
if attr.type == attr_type_pb2.INT: if attr.type == attribute_pb2.INT:
new_attr.i = user_defined_attr new_attr.i = user_defined_attr
elif attr.type == attr_type_pb2.FLOAT: elif attr.type == attribute_pb2.FLOAT:
new_attr.f = user_defined_attr new_attr.f = user_defined_attr
elif attr.type == attr_type_pb2.STRING: elif attr.type == attribute_pb2.STRING:
new_attr.s = user_defined_attr new_attr.s = user_defined_attr
elif attr.type == attr_type_pb2.INTS: elif attr.type == attribute_pb2.INTS:
new_attr.ints.extend(user_defined_attr) new_attr.ints.extend(user_defined_attr)
elif attr.type == attr_type_pb2.FLOATS: elif attr.type == attribute_pb2.FLOATS:
new_attr.floats.extend(user_defined_attr) new_attr.floats.extend(user_defined_attr)
elif attr.type == attr_type_pb2.STRINGS: elif attr.type == attribute_pb2.STRINGS:
new_attr.strings.extend(user_defined_attr) new_attr.strings.extend(user_defined_attr)
else: else:
raise NotImplementedError("Not support attribute type " + raise NotImplementedError("Not support attribute type " +
...@@ -109,7 +108,7 @@ class OpDescCreationMethod(object): ...@@ -109,7 +108,7 @@ class OpDescCreationMethod(object):
retv = [] retv = []
if multiple: if multiple:
var_format = op_desc_pb2.AttrDesc() var_format = op_desc_pb2.AttrDesc()
var_format.type = attr_type_pb2.INTS var_format.type = attribute_pb2.INTS
var_format.name = "%s_format" % in_out var_format.name = "%s_format" % in_out
var_format.ints.append(0) var_format.ints.append(0)
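The input_format / output_format attribute built here records where each named input group starts inside the flattened input list. One plausible reading, consistent with the [0, 1, 2, 3] and [0, 3, 6, 7] values expected by test_operator.py later in this diff, is a running offset (hypothetical helper, for illustration only):

    def input_format(groups):
        """Offsets of each input group inside the flattened input list.

        e.g. [["x1", "x2", "x3"], ["w1", "w2", "w3"], ["b"]] -> [0, 3, 6, 7]
        """
        fmt = [0]
        for group in groups:
            fmt.append(fmt[-1] + len(group))
        return fmt

    print(input_format([["x"], ["w"], ["b"]]))                       # [0, 1, 2, 3]
    print(input_format([["x1", "x2", "x3"], ["w1", "w2", "w3"], ["b"]]))  # [0, 3, 6, 7]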
...@@ -146,64 +145,14 @@ class OpDescCreationMethod(object): ...@@ -146,64 +145,14 @@ class OpDescCreationMethod(object):
return False return False
def get_docstring_from_op_proto(op_proto): class OpInfo(object):
""" def __init__(self, name, method, inputs, outputs, attrs, no_temp_outputs):
Generate docstring from a OpProto self.name = name
:param op_proto: a OpProto instance. self.method = method
:type op_proto: op_proto_pb2.OpProto self.inputs = inputs
:return: docstring self.outputs = outputs
""" self.attrs = attrs
if not isinstance(op_proto, op_proto_pb2.OpProto): self.no_temp_outputs = no_temp_outputs
raise TypeError("Input must be OpProto")
f = cStringIO.StringIO()
f.write(op_proto.comment)
f.write("\n")
def __append_param__(name, comment, type):
# Maybe replace the following line with template engine is better.
f.write(":param ")
f.write(name)
f.write(": ")
f.write(comment)
f.write("\n")
f.write(":type ")
f.write(name)
f.write(": ")
f.write(type)
f.write("\n")
for ipt in op_proto.inputs:
__append_param__(ipt.name, ipt.comment, "list | basestr"
if ipt.multiple else "basestr")
temp_var_prefix = \
"This is a temporary variable. It does not have to set by user. "
for opt in op_proto.outputs:
__append_param__(opt.name, opt.comment if not opt.temporary else
temp_var_prefix + opt.comment, "list | basestr"
if opt.multiple else "basestr")
for attr in op_proto.attrs:
attr_type = None
if attr.type == attr_type_pb2.INT:
attr_type = "int"
elif attr.type == attr_type_pb2.FLOAT:
attr_type = "float"
elif attr.type == attr_type_pb2.STRING:
attr_type = "basestr"
elif attr.type == attr_type_pb2.INTS:
attr_type = "list of int"
elif attr.type == attr_type_pb2.FLOATS:
attr_type = "list of float"
elif attr.type == attr_type_pb2.STRINGS:
attr_type = "list of basestr"
if attr_type is None:
raise RuntimeError("Not supported attribute type " + attr.type)
__append_param__(attr.name, attr.comment, attr_type)
return f.getvalue()
def create_op_creation_method(op_proto): def create_op_creation_method(op_proto):
...@@ -216,38 +165,57 @@ def create_op_creation_method(op_proto): ...@@ -216,38 +165,57 @@ def create_op_creation_method(op_proto):
opdesc = method(*args, **kwargs) opdesc = method(*args, **kwargs)
return core.Operator.create(opdesc.SerializeToString()) return core.Operator.create(opdesc.SerializeToString())
__impl__.__doc__ = get_docstring_from_op_proto(op_proto) return OpInfo(
__impl__.all_input_args = [var.name for var in op_proto.inputs] method=__impl__,
__impl__.all_output_args = [var.name for var in op_proto.outputs] name=op_proto.type,
__impl__.all_attr_args = [attr.name for attr in op_proto.attrs] inputs=[var.name for var in op_proto.inputs],
__impl__.all_not_temp_output_args = [ outputs=[var.name for var in op_proto.outputs],
var.name for var in op_proto.outputs if not var.temporary attrs=[attr.name for attr in op_proto.attrs],
] no_temp_outputs=[
var.name for var in op_proto.outputs if not var.temporary
])
return __impl__
class OperatorFactory(object):
def __init__(self):
self.op_methods = dict()
for op_proto in get_all_op_protos():
method = create_op_creation_method(op_proto)
self.op_methods[method.name] = method
class OpCreationsHolder(object): def __call__(self, *args, **kwargs):
""" if 'type' in kwargs:
A object will holds all op creation methods. if len(args) != 0:
raise ValueError("All Paddle argument should be key-word "
Use `op_creations.xxx_op` to access them. "argument except type")
""" t = kwargs.pop('type')
pass else:
if len(args) != 1:
raise ValueError("All Paddle argument should be key-word "
"argument except type")
t = args[0]
return self.get_op_info(t).method(**kwargs)
op_creations = OpCreationsHolder() def types(self):
return self.op_methods.keys()
def get_op_info(self, t):
if t not in self.op_methods:
raise ValueError("operator %s is not registered", t)
return self.op_methods.get(t)
def __bootstrap__(): def get_op_input_names(self, type):
""" return self.get_op_info(type).inputs
Bootstrap function for this module. It will dynamic create all op creation
methods in runtime. def get_op_output_names(self, type):
""" return self.get_op_info(type).outputs
for op_proto in get_all_op_protos():
func = create_op_creation_method(op_proto) def get_op_attr_names(self, type):
func.__name__ = str(op_proto.type) return self.get_op_info(type).attrs
setattr(op_creations, func.__name__, func)
def get_op_no_temp_output_names(self, type):
return self.get_op_info(type).no_temp_outputs
__bootstrap__() Operator = OperatorFactory() # Default global factory
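With OperatorFactory in place, a single global Operator object replaces the generated op_creations module. Going by the tests further down in this diff, usage looks like the sketch below; it needs a built paddle.v2.framework.core, so treat it as a sketch rather than something runnable from a plain source checkout:

    from paddle.v2.framework.op import Operator

    # Create an operator by type name plus keyword arguments (as in the tests).
    add_op = Operator("add_two", X="X", Y="Y", Out="Out")

    # The factory also exposes per-type introspection helpers.
    print(Operator.types())
    print(Operator.get_op_input_names("add_two"))    # e.g. ['X', 'Y']
    print(Operator.get_op_output_names("add_two"))   # e.g. ['Out']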
add_python_test(test_framework py_test(test_net SRCS test_net.py)
test_protobuf.py
test_scope.py py_test(test_fc_op SRCS test_fc_op.py)
test_default_scope_funcs.py py_test(test_scope SRCS test_scope.py)
test_op_creation_methods.py
test_net.py py_test(test_tensor SRCS test_tensor.py)
test_tensor.py py_test(test_mul_op SRCS test_mul_op.py)
test_fc_op.py
test_add_two_op.py py_test(test_mean_op SRCS test_mean_op.py)
test_sgd_op.py
test_mul_op.py py_test(test_protobuf SRCS test_protobuf.py)
test_mean_op.py
test_sigmoid_op.py py_test(test_add_two_op SRCS test_add_two_op.py)
test_softmax_op.py py_test(test_sigmoid_op SRCS test_sigmoid_op.py)
test_rowwise_add_op.py py_test(test_softmax_op SRCS test_softmax_op.py)
test_fill_zeros_like_op.py py_test(test_fill_zeros_like_op SRCS test_fill_zeros_like_op.py)
test_network.py
gradient_checker.py) py_test(gradient_checker SRCS gradient_checker.py)
py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py)
py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py)
py_test(test_operator SRCS test_operator.py)
import paddle.v2.framework.core as core import paddle.v2.framework.core as core
from paddle.v2.framework.create_op_creation_methods import op_creations from paddle.v2.framework.op import Operator
import numpy import numpy
import unittest import unittest
...@@ -80,7 +80,7 @@ if __name__ == '__main__': ...@@ -80,7 +80,7 @@ if __name__ == '__main__':
class GetNumericGradientTest(unittest.TestCase): class GetNumericGradientTest(unittest.TestCase):
def test_add_op(self): def test_add_op(self):
add_op = op_creations.add_two(X="X", Y="Y", Out="Z") add_op = Operator('add_two', X="X", Y="Y", Out="Z")
x = numpy.random.random((10, 1)).astype("float32") x = numpy.random.random((10, 1)).astype("float32")
y = numpy.random.random((10, 1)).astype("float32") y = numpy.random.random((10, 1)).astype("float32")
......
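gradient_checker.py, exercised by GetNumericGradientTest above, compares analytic gradients against a numeric estimate. The numeric side is plain finite differences; a self-contained numpy illustration of that idea (not Paddle's implementation) is:

    import numpy as np

    def numeric_gradient(f, x, eps=1e-4):
        """Estimate df/dx elementwise with central differences."""
        x = x.astype("float64")
        grad = np.zeros_like(x)
        it = np.nditer(x, flags=["multi_index"])
        while not it.finished:
            idx = it.multi_index
            orig = x[idx]
            x[idx] = orig + eps
            f_plus = f(x)
            x[idx] = orig - eps
            f_minus = f(x)
            x[idx] = orig
            grad[idx] = (f_plus - f_minus) / (2.0 * eps)
            it.iternext()
        return grad

    x = np.random.random((10, 1))
    # For f(x) = sum(x ** 2), the exact gradient is 2 * x.
    g = numeric_gradient(lambda v: np.sum(v ** 2), x)
    print(np.allclose(g, 2 * x, atol=1e-3))  # True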
import paddle.v2.framework.core as core import paddle.v2.framework.core as core
import unittest import unittest
import numpy import numpy
import paddle.v2.framework.create_op_creation_methods as creation from paddle.v2.framework.op import Operator
class OpTestMeta(type): class OpTestMeta(type):
...@@ -21,50 +21,50 @@ class OpTestMeta(type): ...@@ -21,50 +21,50 @@ class OpTestMeta(type):
obj = super(OpTestMeta, cls).__new__(cls, name, bases, attrs) obj = super(OpTestMeta, cls).__new__(cls, name, bases, attrs)
def test_all(self): def test_all(self):
func = getattr(creation.op_creations, self.type, None)
self.assertIsNotNone(func)
scope = core.Scope() scope = core.Scope()
kwargs = dict() kwargs = dict()
places = [] places = [core.CPUPlace()]
places.append(core.CPUPlace()) if core.is_compile_gpu() and core.Operator.support_gpu(self.type):
if core.is_compile_gpu():
places.append(core.GPUPlace(0)) places.append(core.GPUPlace(0))
for place in places: for place in places:
for in_name in func.all_input_args: for in_name in Operator.get_op_input_names(self.type):
if hasattr(self, in_name): if hasattr(self, "inputs") and in_name in self.inputs:
kwargs[in_name] = in_name kwargs[in_name] = in_name
var = scope.new_var(in_name).get_tensor() var = scope.new_var(in_name).get_tensor()
arr = getattr(self, in_name) arr = self.inputs[in_name]
var.set_dims(arr.shape) var.set_dims(arr.shape)
var.set(arr, place) var.set(arr, place)
else: else:
kwargs[in_name] = "@EMPTY@" kwargs[in_name] = "@EMPTY@"
for out_name in func.all_output_args: for out_name in Operator.get_op_output_names(self.type):
if hasattr(self, out_name): if not hasattr(self, "outputs"):
kwargs[out_name] = out_name raise ValueError(
scope.new_var(out_name).get_tensor() "The test op must set self.outputs dict.")
if out_name not in self.outputs:
raise ValueError("The %s is not in self.outputs dict." %
(out_name))
kwargs[out_name] = out_name
scope.new_var(out_name).get_tensor()
for attr_name in func.all_attr_args: for attr_name in Operator.get_op_attr_names(self.type):
if hasattr(self, attr_name): if hasattr(self, "attrs") and attr_name in self.attrs:
kwargs[attr_name] = getattr(self, attr_name) kwargs[attr_name] = self.attrs[attr_name]
op = func(**kwargs) op = Operator(self.type, **kwargs)
op.infer_shape(scope) op.infer_shape(scope)
ctx = core.DeviceContext.create(place) ctx = core.DeviceContext.create(place)
op.run(scope, ctx) op.run(scope, ctx)
for out_name in func.all_output_args: for out_name in Operator.get_op_output_names(self.type):
actual = numpy.array(scope.find_var(out_name).get_tensor()) actual = numpy.array(scope.find_var(out_name).get_tensor())
expect = getattr(self, out_name) expect = self.outputs[out_name]
# TODO(qijun) The default decimal is 7, but numpy.dot and eigen.mul self.assertTrue(
# has some diff, and could not pass unittest. So I set decimal 3 here. numpy.allclose(actual, expect),
# And I will check this in future. "output name: " + out_name + "has diff")
numpy.testing.assert_almost_equal(actual, expect, decimal=3)
obj.test_all = test_all obj.test_all = test_all
return obj return obj
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
import numpy import numpy
import paddle.v2.framework.core as core import paddle.v2.framework.core as core
import paddle.v2.framework.create_op_creation_methods as creation from paddle.v2.framework.op import Operator
from op_test_util import OpTestMeta from op_test_util import OpTestMeta
...@@ -12,14 +12,16 @@ class TestAddOp(unittest.TestCase): ...@@ -12,14 +12,16 @@ class TestAddOp(unittest.TestCase):
def setUp(self): def setUp(self):
self.type = "add_two" self.type = "add_two"
self.X = numpy.random.random((102, 105)).astype("float32") self.inputs = {
self.Y = numpy.random.random((102, 105)).astype("float32") 'X': numpy.random.random((102, 105)).astype("float32"),
self.Out = self.X + self.Y 'Y': numpy.random.random((102, 105)).astype("float32")
}
self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
class TestAddGradOp(unittest.TestCase): class TestAddGradOp(unittest.TestCase):
def test_add_grad(self): def test_add_grad(self):
op = creation.op_creations.add_two(X="X", Y="Y", Out="Out") op = Operator('add_two', X="X", Y="Y", Out="Out")
backward_op = core.Operator.backward(op, set()) backward_op = core.Operator.backward(op, set())
self.assertEqual(backward_op.type(), "add_two_grad") self.assertEqual(backward_op.type(), "add_two_grad")
expected = '''Op(add_two_grad), inputs:(X, Y, Out, Out@GRAD), outputs:(X@GRAD, Y@GRAD).''' expected = '''Op(add_two_grad), inputs:(X, Y, Out, Out@GRAD), outputs:(X@GRAD, Y@GRAD).'''
......
...@@ -7,16 +7,20 @@ class TestSGD(unittest.TestCase): ...@@ -7,16 +7,20 @@ class TestSGD(unittest.TestCase):
__metaclass__ = OpTestMeta __metaclass__ = OpTestMeta
def setUp(self): def setUp(self):
# TODO this unit test is not passed
self.type = "onehot_cross_entropy" self.type = "onehot_cross_entropy"
batch_size = 100 batch_size = 100
class_num = 10 class_num = 10
self.X = numpy.random.random((batch_size, class_num)).astype("float32") X = numpy.random.random((batch_size, class_num)).astype("float32")
self.label = 5 * numpy.ones(batch_size).astype("int32") label = 5 * numpy.ones(batch_size).astype("int32")
self.inputs = {'X': X, 'label': label}
Y = [] Y = []
for i in range(0, batch_size): for i in range(0, batch_size):
Y.append(-numpy.log(self.X[i][self.label[i]])) Y.append(-numpy.log(X[i][label[i]]))
self.Y = numpy.array(Y).astype("float32") self.outputs = {'Y': numpy.array(Y).astype("float32")}
# TODO(superjom) add gradient check
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
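The loop in the onehot_cross_entropy test computes -log(X[i][label[i]]) row by row. The same reference output can be written as one vectorized numpy expression, which makes the intended semantics easier to see (illustrative rewrite only):

    import numpy

    batch_size, class_num = 100, 10
    X = numpy.random.random((batch_size, class_num)).astype("float32")
    label = 5 * numpy.ones(batch_size).astype("int32")

    # Equivalent to: Y[i] = -log(X[i][label[i]]) for every row i.
    Y = -numpy.log(X[numpy.arange(batch_size), label]).astype("float32")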
import paddle.v2.framework.core as core import paddle.v2.framework.core as core
import unittest import unittest
import numpy import numpy
import paddle.v2.framework.create_op_creation_methods as creation from paddle.v2.framework.op import Operator
class TestFc(unittest.TestCase): class TestFc(unittest.TestCase):
...@@ -24,7 +24,7 @@ class TestFc(unittest.TestCase): ...@@ -24,7 +24,7 @@ class TestFc(unittest.TestCase):
# Set a real numpy array here. # Set a real numpy array here.
# x_tensor.set(numpy.array([])) # x_tensor.set(numpy.array([]))
op = creation.op_creations.fc(X="X", Y="Y", W="W") op = Operator("fc", X="X", Y="Y", W="W")
for out in op.outputs(): for out in op.outputs():
if scope.find_var(out) is None: if scope.find_var(out) is None:
......
...@@ -8,8 +8,8 @@ class TestMeanOp(unittest.TestCase): ...@@ -8,8 +8,8 @@ class TestMeanOp(unittest.TestCase):
def setUp(self): def setUp(self):
self.type = "mean" self.type = "mean"
self.X = np.random.random((32, 784)).astype("float32") self.inputs = {'X': np.random.random((32, 784)).astype("float32")}
self.Out = np.mean(self.X) self.outputs = {'Out': np.mean(self.inputs['X'])}
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -8,9 +8,11 @@ class TestMulOp(unittest.TestCase): ...@@ -8,9 +8,11 @@ class TestMulOp(unittest.TestCase):
def setUp(self): def setUp(self):
self.type = "mul" self.type = "mul"
self.X = np.random.random((32, 84)).astype("float32") self.inputs = {
self.Y = np.random.random((84, 100)).astype("float32") 'X': np.random.random((32, 84)).astype("float32"),
self.Out = np.dot(self.X, self.Y) 'Y': np.random.random((84, 100)).astype("float32")
}
self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
if __name__ == '__main__': if __name__ == '__main__':
......
import paddle.v2.framework.core as core import paddle.v2.framework.core as core
from paddle.v2.framework.create_op_creation_methods import op_creations from paddle.v2.framework.op import Operator
import unittest import unittest
class TestNet(unittest.TestCase): class TestNet(unittest.TestCase):
def test_net_all(self): def test_net_all(self):
net = core.Net.create() net = core.Net.create()
op1 = op_creations.add_two(X="X", Y="Y", Out="Out") op1 = Operator("add_two", X="X", Y="Y", Out="Out")
net.add_op(op1) net.add_op(op1)
net2 = core.Net.create() net2 = core.Net.create()
net2.add_op(op_creations.fc(X="X", W="w", Y="fc.out")) net2.add_op(Operator("fc", X="X", W="w", Y="fc.out"))
net2.complete_add_op(True) net2.complete_add_op(True)
net.add_op(net2) net.add_op(net2)
net.complete_add_op(True) net.complete_add_op(True)
......
from paddle.v2.framework.network import Network
import paddle.v2.framework.core as core
import unittest
class TestNet(unittest.TestCase):
def test_net_all(self):
net = Network()
out = net.add_two(X="X", Y="Y")
fc_out = net.fc(X=out, W="w")
net.complete_add_op()
self.assertTrue(isinstance(fc_out, core.Variable))
self.assertEqual(
'''Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, add_two@OUT@0, fc@OUT@1).
Op(add_two), inputs:(X, Y), outputs:(add_two@OUT@0).
Op(fc), inputs:(add_two@OUT@0, w, @EMPTY@), outputs:(fc@OUT@1, @TEMP@fc@0).
Op(mul), inputs:(add_two@OUT@0, w), outputs:(@TEMP@fc@0).
Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc@OUT@1).
''', str(net))
net2 = Network()
tmp = net2.add_two(X="X", Y="Y")
self.assertTrue(isinstance(tmp, core.Variable))
net2.complete_add_op()
self.assertEqual(
'''Op(plain_net), inputs:(X, Y), outputs:(add_two@OUT@2).
Op(add_two), inputs:(X, Y), outputs:(add_two@OUT@2).
''', str(net2))
if __name__ == '__main__':
unittest.main()
import unittest import unittest
import paddle.v2.framework.create_op_creation_methods as creation import paddle.v2.framework.op as op
import paddle.v2.framework.core as core import paddle.v2.framework.core as core
import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2 import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2 import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2 import paddle.v2.framework.proto.attribute_pb2 as attribute_pb2
class TestGetAllProtos(unittest.TestCase): class TestGetAllProtos(unittest.TestCase):
def test_all(self): def test_all(self):
all_protos = creation.get_all_op_protos() all_protos = op.get_all_op_protos()
self.assertNotEqual(0, len(all_protos)) self.assertNotEqual(0, len(all_protos))
for each in all_protos: for each in all_protos:
...@@ -17,25 +17,25 @@ class TestGetAllProtos(unittest.TestCase): ...@@ -17,25 +17,25 @@ class TestGetAllProtos(unittest.TestCase):
class TestOpDescCreationMethod(unittest.TestCase): class TestOpDescCreationMethod(unittest.TestCase):
def test_plain_input_output(self): def test_plain_input_output(self):
op = op_proto_pb2.OpProto() op_proto = op_proto_pb2.OpProto()
op.type = "test" op_proto.type = "test"
ipt = op.inputs.add() ipt = op_proto.inputs.add()
ipt.name = "X" ipt.name = "X"
ipt.comment = "not matter" ipt.comment = "not matter"
ipt = op.inputs.add() ipt = op_proto.inputs.add()
ipt.name = "Y" ipt.name = "Y"
ipt.comment = "not matter" ipt.comment = "not matter"
opt = op.outputs.add() opt = op_proto.outputs.add()
opt.name = "Z" opt.name = "Z"
opt.comment = "not matter" opt.comment = "not matter"
op.comment = "not matter" op_proto.comment = "not matter"
self.assertTrue(op.IsInitialized()) self.assertTrue(op_proto.IsInitialized())
method = creation.OpDescCreationMethod(op) method = op.OpDescCreationMethod(op_proto)
output = method(X="a", Y="b", Z="c") output = method(X="a", Y="b", Z="c")
expected = op_desc_pb2.OpDesc() expected = op_desc_pb2.OpDesc()
...@@ -45,29 +45,29 @@ class TestOpDescCreationMethod(unittest.TestCase): ...@@ -45,29 +45,29 @@ class TestOpDescCreationMethod(unittest.TestCase):
self.assertEqual(expected, output) self.assertEqual(expected, output)
def test_multiple_input_plain_output(self): def test_multiple_input_plain_output(self):
op = op_proto_pb2.OpProto() op_proto = op_proto_pb2.OpProto()
op.type = "fc" op_proto.type = "fc"
ipt = op.inputs.add() ipt = op_proto.inputs.add()
ipt.name = "X" ipt.name = "X"
ipt.comment = "" ipt.comment = ""
ipt.multiple = True ipt.multiple = True
ipt = op.inputs.add() ipt = op_proto.inputs.add()
ipt.name = "W" ipt.name = "W"
ipt.comment = "" ipt.comment = ""
ipt.multiple = True ipt.multiple = True
ipt = op.inputs.add() ipt = op_proto.inputs.add()
ipt.name = "b" ipt.name = "b"
ipt.comment = "" ipt.comment = ""
out = op.outputs.add() out = op_proto.outputs.add()
out.name = "Y" out.name = "Y"
out.comment = "" out.comment = ""
op.comment = "" op_proto.comment = ""
self.assertTrue(op.IsInitialized()) self.assertTrue(op_proto.IsInitialized())
method = creation.OpDescCreationMethod(op) method = op.OpDescCreationMethod(op_proto)
generated1 = method(X="x", W="w", b="b", Y="y") generated1 = method(X="x", W="w", b="b", Y="y")
expected1 = op_desc_pb2.OpDesc() expected1 = op_desc_pb2.OpDesc()
...@@ -76,7 +76,7 @@ class TestOpDescCreationMethod(unittest.TestCase): ...@@ -76,7 +76,7 @@ class TestOpDescCreationMethod(unittest.TestCase):
expected1.type = 'fc' expected1.type = 'fc'
attr = expected1.attrs.add() attr = expected1.attrs.add()
attr.name = 'input_format' attr.name = 'input_format'
attr.type = attr_type_pb2.INTS attr.type = attribute_pb2.INTS
attr.ints.extend([0, 1, 2, 3]) attr.ints.extend([0, 1, 2, 3])
self.assertEqual(expected1, generated1) self.assertEqual(expected1, generated1)
...@@ -88,34 +88,34 @@ class TestOpDescCreationMethod(unittest.TestCase): ...@@ -88,34 +88,34 @@ class TestOpDescCreationMethod(unittest.TestCase):
expected2.type = 'fc' expected2.type = 'fc'
attr = expected2.attrs.add() attr = expected2.attrs.add()
attr.name = 'input_format' attr.name = 'input_format'
attr.type = attr_type_pb2.INTS attr.type = attribute_pb2.INTS
attr.ints.extend([0, 3, 6, 7]) attr.ints.extend([0, 3, 6, 7])
self.assertEqual(expected2, generated2) self.assertEqual(expected2, generated2)
def test_attrs(self): def test_attrs(self):
op = op_proto_pb2.OpProto() op_proto = op_proto_pb2.OpProto()
op.type = "test" op_proto.type = "test"
ipt = op.inputs.add() ipt = op_proto.inputs.add()
ipt.name = 'X' ipt.name = 'X'
ipt.comment = "" ipt.comment = ""
def __add_attr__(name, type): def __add_attr__(name, type):
attr = op.attrs.add() attr = op_proto.attrs.add()
attr.name = name attr.name = name
attr.comment = "" attr.comment = ""
attr.type = type attr.type = type
__add_attr__("int_attr", attr_type_pb2.INT) __add_attr__("int_attr", attribute_pb2.INT)
__add_attr__("float_attr", attr_type_pb2.FLOAT) __add_attr__("float_attr", attribute_pb2.FLOAT)
__add_attr__("string_attr", attr_type_pb2.STRING) __add_attr__("string_attr", attribute_pb2.STRING)
__add_attr__("ints_attr", attr_type_pb2.INTS) __add_attr__("ints_attr", attribute_pb2.INTS)
__add_attr__("floats_attr", attr_type_pb2.FLOATS) __add_attr__("floats_attr", attribute_pb2.FLOATS)
__add_attr__("strings_attr", attr_type_pb2.STRINGS) __add_attr__("strings_attr", attribute_pb2.STRINGS)
op.comment = "" op_proto.comment = ""
self.assertTrue(op.IsInitialized()) self.assertTrue(op_proto.IsInitialized())
method = creation.OpDescCreationMethod(op) method = op.OpDescCreationMethod(op_proto)
generated = method( generated = method(
X="a", X="a",
...@@ -131,119 +131,68 @@ class TestOpDescCreationMethod(unittest.TestCase): ...@@ -131,119 +131,68 @@ class TestOpDescCreationMethod(unittest.TestCase):
expected.inputs.extend(['a']) expected.inputs.extend(['a'])
attr = expected.attrs.add() attr = expected.attrs.add()
attr.name = "int_attr" attr.name = "int_attr"
attr.type = attr_type_pb2.INT attr.type = attribute_pb2.INT
attr.i = 10 attr.i = 10
attr = expected.attrs.add() attr = expected.attrs.add()
attr.name = "float_attr" attr.name = "float_attr"
attr.type = attr_type_pb2.FLOAT attr.type = attribute_pb2.FLOAT
attr.f = 3.2 attr.f = 3.2
attr = expected.attrs.add() attr = expected.attrs.add()
attr.name = "string_attr" attr.name = "string_attr"
attr.type = attr_type_pb2.STRING attr.type = attribute_pb2.STRING
attr.s = "test_str" attr.s = "test_str"
attr = expected.attrs.add() attr = expected.attrs.add()
attr.name = "ints_attr" attr.name = "ints_attr"
attr.type = attr_type_pb2.INTS attr.type = attribute_pb2.INTS
attr.ints.extend([0, 1, 2, 3, 4]) attr.ints.extend([0, 1, 2, 3, 4])
attr = expected.attrs.add() attr = expected.attrs.add()
attr.name = 'floats_attr' attr.name = 'floats_attr'
attr.type = attr_type_pb2.FLOATS attr.type = attribute_pb2.FLOATS
attr.floats.extend([0.2, 3.2, 4.5]) attr.floats.extend([0.2, 3.2, 4.5])
attr = expected.attrs.add() attr = expected.attrs.add()
attr.name = 'strings_attr' attr.name = 'strings_attr'
attr.type = attr_type_pb2.STRINGS attr.type = attribute_pb2.STRINGS
attr.strings.extend(['a', 'b', 'c']) attr.strings.extend(['a', 'b', 'c'])
self.assertEqual(expected, generated) self.assertEqual(expected, generated)
def test_input_temporary_output(self): def test_input_temporary_output(self):
op = op_proto_pb2.OpProto() op_proto = op_proto_pb2.OpProto()
op.type = "test" op_proto.type = "test"
out = op.outputs.add() out = op_proto.outputs.add()
out.name = "OUT" out.name = "OUT"
out.comment = "" out.comment = ""
out = op.outputs.add() out = op_proto.outputs.add()
out.name = "TMP" out.name = "TMP"
out.comment = "" out.comment = ""
out.temporary = True out.temporary = True
out = op.outputs.add() out = op_proto.outputs.add()
out.name = "OUT2" out.name = "OUT2"
out.comment = "" out.comment = ""
op.comment = "" op_proto.comment = ""
method = creation.OpDescCreationMethod(op) method = op.OpDescCreationMethod(op_proto)
generated = method(OUT="a", OUT2="b") generated = method(OUT="a", OUT2="b")
desc = op_desc_pb2.OpDesc() desc = op_desc_pb2.OpDesc()
desc.outputs.extend(["a", core.var_names.temp(), "b"]) desc.outputs.extend(["a", core.var_names.temp(), "b"])
desc.type = "test" desc.type = "test"
attr = desc.attrs.add() attr = desc.attrs.add()
attr.name = "temporary_index" attr.name = "temporary_index"
attr.type = attr_type_pb2.INTS attr.type = attribute_pb2.INTS
attr.ints.append(2) attr.ints.append(2)
self.assertEqual(generated, desc) self.assertEqual(generated, desc)
class TestOpCreationDocStr(unittest.TestCase):
def test_all(self):
op = op_proto_pb2.OpProto()
op.type = "test"
op.comment = """Test Op.
This op is used for unit test, not a real op.
"""
a = op.inputs.add()
a.name = "a"
a.comment = "Input a for test op"
a.multiple = True
b = op.inputs.add()
b.name = "b"
b.comment = "Input b for test op"
self.assertTrue(op.IsInitialized())
o1 = op.outputs.add()
o1.name = "output"
o1.comment = "The output of test op"
o2 = op.outputs.add()
o2.name = "temp output"
o2.comment = "The temporary output of test op"
o2.temporary = True
test_str = op.attrs.add()
test_str.name = "str_attr"
test_str.type = attr_type_pb2.STRING
test_str.comment = "A string attribute for test op"
actual = creation.get_docstring_from_op_proto(op)
expected_docstring = '''Test Op.
This op is used for unit test, not a real op.
:param a: Input a for test op
:type a: list | basestr
:param b: Input b for test op
:type b: basestr
:param output: The output of test op
:type output: basestr
:param temp output: This is a temporary variable. It does not have to set by user. The temporary output of test op
:type temp output: basestr
:param str_attr: A string attribute for test op
:type str_attr: basestr
'''
self.assertEqual(expected_docstring, actual)
class TestOpCreations(unittest.TestCase): class TestOpCreations(unittest.TestCase):
def test_all(self): def test_all(self):
add_op = creation.op_creations.add_two(X="a", Y="b", Out="z") add_op = op.Operator("add_two", X="a", Y="b", Out="z")
self.assertIsNotNone(add_op) self.assertIsNotNone(add_op)
# Invoke C++ DebugString() # Invoke C++ DebugString()
self.assertEqual('Op(add_two), inputs:(a, b), outputs:(z).', self.assertEqual('Op(add_two), inputs:(a, b), outputs:(z).',
......
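For reference, a minimal sketch of the renamed creation API, assuming the refactored helpers live in paddle.v2.framework.op (the module a later hunk imports Operator from) and that the OpProto message layout is unchanged:

    import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
    from paddle.v2.framework import op

    op_proto = op_proto_pb2.OpProto()
    op_proto.type = "test"
    ipt = op_proto.inputs.add()
    ipt.name = "X"
    ipt.comment = "not matter"
    opt = op_proto.outputs.add()
    opt.name = "Z"
    opt.comment = "not matter"
    op_proto.comment = "not matter"

    # OpDescCreationMethod wraps an OpProto into a factory; keyword arguments
    # map input/output slot names to variable names.
    method = op.OpDescCreationMethod(op_proto)
    op_desc = method(X="a", Z="c")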
import paddle.v2.framework.proto.op_proto_pb2 import paddle.v2.framework.proto.op_proto_pb2 as op_proto_lib
import paddle.v2.framework.proto.attr_type_pb2 import paddle.v2.framework.proto.attribute_pb2 as attr_type_lib
import unittest import unittest
class TestFrameworkProto(unittest.TestCase): class TestFrameworkProto(unittest.TestCase):
def test_all(self): def test_all(self):
op_proto_lib = paddle.v2.framework.proto.op_proto_pb2
attr_type_lib = paddle.v2.framework.proto.attr_type_pb2
op_proto = op_proto_lib.OpProto() op_proto = op_proto_lib.OpProto()
ipt0 = op_proto.inputs.add() ipt0 = op_proto.inputs.add()
ipt0.name = "a" ipt0.name = "a"
......
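A small sketch of the renamed proto module (attr_type_pb2 becomes attribute_pb2), assuming the generated messages and enum value names are otherwise unchanged:

    import paddle.v2.framework.proto.op_proto_pb2 as op_proto_lib
    import paddle.v2.framework.proto.attribute_pb2 as attr_type_lib

    op_proto = op_proto_lib.OpProto()
    attr = op_proto.attrs.add()
    attr.name = "scale"        # hypothetical attribute for illustration
    attr.comment = ""
    attr.type = attr_type_lib.FLOAT  # enum values (INT, FLOAT, STRING, ...) keep their old names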
import logging
import paddle.v2.framework.core as core import paddle.v2.framework.core as core
import unittest import unittest
import numpy as np import numpy as np
...@@ -7,10 +8,9 @@ ops = creation.op_creations ...@@ -7,10 +8,9 @@ ops = creation.op_creations
def create_tensor(scope, name, shape): def create_tensor(scope, name, shape):
tensor = scope.create_var(name).get_tensor() tensor = scope.new_var(name).get_tensor()
tensor.set_dims(shape) tensor.set_dims(shape)
tensor.alloc_float() tensor.set(np.random.random(shape), core.CPUPlace())
tensor.set(np.random.random(shape))
return tensor return tensor
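Reassembled from the hunk above, the updated helper reads as follows (a sketch; judging from the diff, Tensor.set now takes the data together with the target place):

    import numpy as np
    import paddle.v2.framework.core as core

    def create_tensor(scope, name, shape):
        # new_var() replaces create_var(); set() copies the numpy array to the given place
        tensor = scope.new_var(name).get_tensor()
        tensor.set_dims(shape)
        tensor.set(np.random.random(shape), core.CPUPlace())
        return tensor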
...@@ -31,40 +31,36 @@ class TestRNN(unittest.TestCase): ...@@ -31,40 +31,36 @@ class TestRNN(unittest.TestCase):
- h - h
''' '''
input_dim = 30
batch_size = 50
weight_dim = 15
sent_len = 11
def init(self): def init(self):
input_dim = 30
batch_size = 50
weight_dim = 15
self.scope = core.Scope(None)
# create vars
create_tensor(self.scope, "x", [batch_size, input_dim])
create_tensor(self.scope, "W", [input_dim, weight_dim])
create_tensor(self.scope, "U", [weight_dim, weight_dim])
create_tensor(self.scope, "h_boot", [batch_size, weight_dim])
x_alias = "x@alias"
y_alias = "y@alias"
memory = "h@alias"
prememory = "h@pre"
output = "rnn_out"
output_alias = "rnn_out@alias"
# create step net
stepnet_var = self.scope.create_var("stepnet")
stepnet = stepnet_var.get_net()
# stepnet = core.Net.create()
x_fc_op = ops.fc(X=x_alias, W="W", Y="Wx")
h_fc_op = ops.fc(X=prememory, W="U", Y="Uh")
sum_op = ops.add_two(X="Wx", Y="Uh", Out="sum")
sig_op = ops.sigmoid(X="sum", Y=memory)
stepnet.add_op(x_fc_op)
stepnet.add_op(h_fc_op)
stepnet.add_op(sum_op)
stepnet.add_op(sig_op)
stepnet.complete_add_op(True)
self.scope = core.Scope()
self.create_global_variables()
self.create_step_net()
rnn_op = self.create_rnn_op()
ctx = core.DeviceContext.create(core.CPUPlace())
print 'infer_shape'
rnn_op.infer_shape(self.scope)
rnn_op.run(self.scope, ctx)
def create_global_variables(self):
# create inlink
create_tensor(self.scope, "x",
[self.sent_len, self.batch_size, self.input_dim])
create_tensor(self.scope, "W", [self.input_dim, self.input_dim])
create_tensor(self.scope, "U", [self.input_dim, self.input_dim])
create_tensor(self.scope, "h_boot", [self.batch_size, self.input_dim])
self.scope.new_var("step_scopes")
self.scope.new_var("h@alias")
self.scope.new_var("h")
def create_rnn_op(self):
# create RNNOp # create RNNOp
rnnop = ops.recurrent_op( rnnop = ops.recurrent_op(
# inputs # inputs
...@@ -72,17 +68,27 @@ class TestRNN(unittest.TestCase): ...@@ -72,17 +68,27 @@ class TestRNN(unittest.TestCase):
boot_memories=["h_boot"], boot_memories=["h_boot"],
step_net="stepnet", step_net="stepnet",
# outputs # outputs
outlinks=[output], outlinks=["h"],
step_scopes="step_scopes", step_scopes="step_scopes",
# attributes # attributes
inlink_alias=["x@alias"], inlink_alias=["x@alias"],
outlink_alias=[output_alias], outlink_alias=["h@alias"],
pre_memories=[prememory], pre_memories=["h@pre"],
memories=[memory]) memories=["h@alias"])
return rnnop
def create_step_net(self):
var = self.scope.new_var("stepnet")
stepnet = var.get_net()
ctx = core.DeviceContext.cpu_context() x_fc_op = ops.fc(X="x@alias", W="W", Y="Wx")
rnnop.infer_shape(self.scope) h_fc_op = ops.fc(X="h@pre", W="U", Y="Uh")
rnnop.run(self.scope, ctx) sum_op = ops.add_two(X="Wx", Y="Uh", Out="sum")
sig_op = ops.sigmoid(X="sum", Y="h@alias")
for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
stepnet.add_op(op)
stepnet.complete_add_op(True)
def test_recurrent(self): def test_recurrent(self):
self.init() self.init()
......
...@@ -8,9 +8,11 @@ class TestRowwiseAddOp(unittest.TestCase): ...@@ -8,9 +8,11 @@ class TestRowwiseAddOp(unittest.TestCase):
def setUp(self): def setUp(self):
self.type = "rowwise_add" self.type = "rowwise_add"
self.X = np.random.random((32, 84)).astype("float32") self.inputs = {
self.b = np.random.random(84).astype("float32") 'X': np.random.random((32, 84)).astype("float32"),
self.Out = np.add(self.X, self.b) 'b': np.random.random(84).astype("float32")
}
self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])}
if __name__ == '__main__': if __name__ == '__main__':
......
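The rowwise_add, sgd, sigmoid and softmax tests below all move to the same dict-based convention; a minimal sketch, assuming OpTestMeta from op_test_util is wired in as the metaclass as in the existing tests:

    import unittest
    import numpy as np
    from op_test_util import OpTestMeta  # metaclass wiring below is an assumption for illustration

    class TestRowwiseAddOp(unittest.TestCase):
        __metaclass__ = OpTestMeta

        def setUp(self):
            self.type = "rowwise_add"
            # inputs/outputs are declared as name -> ndarray dicts instead of ad-hoc attributes
            self.inputs = {
                'X': np.random.random((32, 84)).astype("float32"),
                'b': np.random.random(84).astype("float32"),
            }
            self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])}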
...@@ -8,10 +8,13 @@ class TestSGD(unittest.TestCase): ...@@ -8,10 +8,13 @@ class TestSGD(unittest.TestCase):
def setUp(self): def setUp(self):
self.type = "sgd" self.type = "sgd"
self.param = numpy.random.random((102, 105)).astype("float32") w = numpy.random.random((102, 105)).astype("float32")
self.grad = numpy.random.random((102, 105)).astype("float32") g = numpy.random.random((102, 105)).astype("float32")
self.learning_rate = 0.1 lr = 0.1
self.param_out = self.param - self.learning_rate * self.grad
self.inputs = {'param': w, 'grad': g}
self.attrs = {'learning_rate': lr}
self.outputs = {'param_out': w - lr * g}
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -8,9 +8,12 @@ class TestSigmoidOp(unittest.TestCase): ...@@ -8,9 +8,12 @@ class TestSigmoidOp(unittest.TestCase):
def setUp(self): def setUp(self):
self.type = "sigmoid" self.type = "sigmoid"
self.X = np.random.random((32, 100)).astype("float32") self.inputs = {'X': np.random.random((32, 100)).astype("float32")}
self.Y = 1 / (1 + np.exp(-self.X)) self.outputs = {'Y': 1 / (1 + np.exp(-self.inputs['X']))}
#class TestSigmoidGradOp(unittest.TestCase):
#TODO(qingqing) add unit test
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
import numpy as np import numpy as np
import paddle.v2.framework.core as core import paddle.v2.framework.core as core
import paddle.v2.framework.create_op_creation_methods as creation from paddle.v2.framework.op import Operator
from op_test_util import OpTestMeta from op_test_util import OpTestMeta
...@@ -19,13 +19,15 @@ class TestSoftmaxOp(unittest.TestCase): ...@@ -19,13 +19,15 @@ class TestSoftmaxOp(unittest.TestCase):
def setUp(self): def setUp(self):
self.type = "softmax" self.type = "softmax"
self.X = np.random.random((32, 100)).astype("float32") self.inputs = {'X': np.random.random((32, 100)).astype("float32")}
self.Y = np.apply_along_axis(stable_softmax, 1, self.X) self.outputs = {
'Y': np.apply_along_axis(stable_softmax, 1, self.inputs['X'])
}
class TestSoftmaxGradOp(unittest.TestCase): class TestSoftmaxGradOp(unittest.TestCase):
def test_softmax_grad(self): def test_softmax_grad(self):
op = creation.op_creations.softmax(X="X", Y="Y") op = Operator('softmax', X="X", Y="Y")
backward_op = core.Operator.backward(op, set()) backward_op = core.Operator.backward(op, set())
self.assertEqual(backward_op.type(), "softmax_grad") self.assertEqual(backward_op.type(), "softmax_grad")
expected = '''Op(softmax_grad), inputs:(X, Y, Y@GRAD), outputs:(X@GRAD).''' expected = '''Op(softmax_grad), inputs:(X, Y, Y@GRAD), outputs:(X@GRAD).'''
......
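A sketch of the generic Operator constructor that replaces the per-op creators, together with the backward derivation used in the softmax gradient test (the empty set is assumed to be the set of variables excluded from gradient computation):

    import paddle.v2.framework.core as core
    from paddle.v2.framework.op import Operator

    fwd_op = Operator("softmax", X="X", Y="Y")      # generic constructor: op type, then slot=var kwargs
    bwd_op = core.Operator.backward(fwd_op, set())  # derive the gradient operator
    assert bwd_op.type() == "softmax_grad"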
if (NOT APPLE) if (NOT APPLE)
# The Mac OS X backend will not be able to function correctly if Python is # The Mac OS X backend will not be able to function correctly if Python is
# not installed as a framework. # not installed as a framework.
add_python_test(test_ploter test_ploter.py) py_test(test_ploter SRCS test_ploter.py)
endif() endif()
add_python_test(reader_tests creator_test.py decorator_test.py) py_test(creator_test SRCS creator_test.py)
py_test(decorator_test SRCS decorator_test.py)
add_python_test(test_v2_api test_data_feeder.py test_op.py test_parameters.py py_test(test_op SRCS test_op.py)
test_layer.py test_rnn_layer.py test_topology.py test_image.py) py_test(test_image SRCS test_image.py)
py_test(test_layer SRCS test_layer.py)
py_test(test_topology SRCS test_topology.py)
py_test(test_rnn_layer SRCS test_rnn_layer.py)
py_test(test_parameters SRCS test_parameters.py)
py_test(test_data_feeder SRCS test_data_feeder.py)
from setuptools import setup from setuptools import setup, Distribution
class BinaryDistribution(Distribution):
def has_ext_modules(foo):
return True
packages=['paddle', packages=['paddle',
'paddle.proto', 'paddle.proto',
...@@ -11,33 +15,44 @@ packages=['paddle', ...@@ -11,33 +15,44 @@ packages=['paddle',
'paddle.v2.master', 'paddle.v2.master',
'paddle.v2.plot', 'paddle.v2.plot',
'paddle.v2.framework', 'paddle.v2.framework',
'paddle.v2.framework.proto'] 'paddle.v2.framework.proto',
'py_paddle']
setup_requires=["requests", setup_requires=["requests",
"numpy", "numpy>=1.12",
"protobuf==3.1", "protobuf==3.1",
"recordio", "recordio",
"matplotlib", "matplotlib",
"rarfile", "rarfile",
"scipy>=0.19.0", "scipy>=0.19.0",
"Pillow", "Pillow",
"nltk"] "nltk>=3.2.2"]
if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
setup_requires+=["opencv-python"] setup_requires+=["opencv-python"]
setup(name='paddle', setup(name='paddlepaddle',
version='${PADDLE_VERSION}', version='${PADDLE_VERSION}',
description='Parallel Distributed Deep Learning', description='Parallel Distributed Deep Learning',
install_requires=setup_requires, install_requires=setup_requires,
packages=packages, packages=packages,
package_data={'paddle.v2.master': ['libpaddle_master.so'], package_data={
'paddle.v2.framework': ['core.so'] 'paddle.v2.master': ['libpaddle_master.so'],
'paddle.v2.framework': ['core.so'],
'py_paddle':['*.py','_swig_paddle.so']
}, },
package_dir={ package_dir={
'': '${CMAKE_CURRENT_SOURCE_DIR}', '': '${CMAKE_CURRENT_SOURCE_DIR}',
# The paddle.v2.framework.proto will be generated while compiling. # The paddle.v2.framework.proto will be generated while compiling.
# So that package points to other directory. # So that package points to other directory.
'paddle.v2.framework.proto': '${PROJ_BINARY_ROOT}/paddle/framework' 'paddle.v2.framework.proto': '${PROJ_BINARY_ROOT}/paddle/framework',
'py_paddle': '${PROJ_ROOT}/paddle/py_paddle'
}, },
scripts=['${PROJ_BINARY_ROOT}/paddle/scripts/paddle'],
distclass=BinaryDistribution,
data_files=[('/usr/local/opt/paddle/bin',
['${PROJ_BINARY_ROOT}/paddle/scripts/paddle_usage',
'${PROJ_BINARY_ROOT}/paddle/trainer/paddle_trainer',
'${PROJ_BINARY_ROOT}/paddle/trainer/paddle_merge_model',
'${PROJ_BINARY_ROOT}/paddle/pserver/paddle_pserver_main'])]
) )