提交 59a8ebc6 编写于 作者: C caoying03

Merge branch 'develop' into kmax_score_layer

...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
description: Format files with ClangFormat. description: Format files with ClangFormat.
entry: clang-format -i entry: clang-format -i
language: system language: system
files: \.(c|cc|cxx|cpp|h|hpp|hxx)$ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang - repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks: hooks:
......
...@@ -36,8 +36,8 @@ include(simd) ...@@ -36,8 +36,8 @@ include(simd)
################################ Configurations ####################################### ################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." OFF) option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND})
option(WITH_MKLML "Compile PaddlePaddle with mklml package." OFF) option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
......
...@@ -27,13 +27,16 @@ RUN apt-get update && \ ...@@ -27,13 +27,16 @@ RUN apt-get update && \
git python-pip python-dev openssh-server bison \ git python-pip python-dev openssh-server bison \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \ curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-numpy python-matplotlib gcc-4.8 g++-4.8 \ python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format-3.8 swig doxygen cmake \ automake locales clang-format-3.8 swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \ liblapack-dev liblapacke-dev libboost-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools && \ net-tools && \
apt-get clean -y apt-get clean -y
# paddle is using numpy.flip, which is introduced since 1.12.0
RUN pip --no-cache-dir install 'numpy>=1.12.0'
# Install Go and glide # Install Go and glide
RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
tar -C /usr/local -xzf go.tgz && \ tar -C /usr/local -xzf go.tgz && \
......
...@@ -74,8 +74,6 @@ if(WITH_MKLDNN) ...@@ -74,8 +74,6 @@ if(WITH_MKLDNN)
set(OPENMP_FLAGS "-fopenmp") set(OPENMP_FLAGS "-fopenmp")
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
else() else()
......
...@@ -42,29 +42,21 @@ macro(add_style_check_target TARGET_NAME) ...@@ -42,29 +42,21 @@ macro(add_style_check_target TARGET_NAME)
if(WITH_STYLE_CHECK) if(WITH_STYLE_CHECK)
set(SOURCES_LIST ${ARGN}) set(SOURCES_LIST ${ARGN})
list(REMOVE_DUPLICATES SOURCES_LIST) list(REMOVE_DUPLICATES SOURCES_LIST)
list(SORT SOURCES_LIST)
foreach(filename ${SOURCES_LIST}) foreach(filename ${SOURCES_LIST})
set(LINT ON)
foreach(pattern ${IGNORE_PATTERN}) foreach(pattern ${IGNORE_PATTERN})
if(filename MATCHES ${pattern}) if(filename MATCHES ${pattern})
message(STATUS "DROP LINT ${filename}") list(REMOVE_ITEM SOURCES_LIST ${filename})
set(LINT OFF)
endif() endif()
endforeach() endforeach()
if(LINT MATCHES ON)
# cpplint code style
get_filename_component(base_filename ${filename} NAME)
set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
add_custom_command(OUTPUT ${CUR_GEN} PRE_BUILD
COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
"--filter=${STYLE_FILTER}"
"--write-success=${CUR_GEN}" ${filename}
DEPENDS ${filename} ${PROJ_ROOT}/paddle/scripts/cpplint.py
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
add_custom_target(${base_filename}.cpplint DEPENDS ${CUR_GEN})
add_dependencies(${TARGET_NAME} ${base_filename}.cpplint)
endif()
endforeach() endforeach()
if(SOURCES_LIST)
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
"--filter=${STYLE_FILTER}"
${SOURCES_LIST}
COMMENT "cpplint: Checking source code style"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endif() endif()
endmacro() endmacro()
...@@ -7,7 +7,7 @@ INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any) ...@@ -7,7 +7,7 @@ INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any)
ExternalProject_Add( ExternalProject_Add(
extern_lib_any extern_lib_any
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/thelink2012/any.git" GIT_REPOSITORY "https://github.com/PaddlePaddle/any.git"
GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020" GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020"
PREFIX ${ANY_SOURCE_DIR} PREFIX ${ANY_SOURCE_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
......
...@@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ...@@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
ExternalProject_Add( ExternalProject_Add(
extern_gflags extern_gflags
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/gflags/gflags.git" # TODO(yiwang): The annoying warnings mentioned in
# https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by
# gflags. I fired a PR https://github.com/gflags/gflags/pull/230
# to fix it. Before it gets accepted by the gflags team, we use
# my personal fork, which contains above fix, temporarily. Let's
# change this back to the official Github repo once my PR is
# merged.
GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git"
PREFIX ${GFLAGS_SOURCES_DIR} PREFIX ${GFLAGS_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
...@@ -69,8 +69,13 @@ ENDIF(NOT ${CBLAS_FOUND}) ...@@ -69,8 +69,13 @@ ENDIF(NOT ${CBLAS_FOUND})
MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
ADD_LIBRARY(cblas STATIC IMPORTED) # FIXME(gangliao): generate cblas target to track all high performance
SET_PROPERTY(TARGET cblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES}) # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
ADD_LIBRARY(cblas STATIC ${dummyfile})
TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
IF(NOT ${CBLAS_FOUND}) IF(NOT ${CBLAS_FOUND})
ADD_DEPENDENCIES(cblas extern_openblas) ADD_DEPENDENCIES(cblas extern_openblas)
LIST(APPEND external_project_dependencies cblas) LIST(APPEND external_project_dependencies cblas)
......
...@@ -24,7 +24,6 @@ IF(WITH_PYTHON) ...@@ -24,7 +24,6 @@ IF(WITH_PYTHON)
ENDIF(WITH_PYTHON) ENDIF(WITH_PYTHON)
SET(py_env "") SET(py_env "")
SET(USE_VIRTUALENV_FOR_TEST 1)
IF(PYTHONINTERP_FOUND) IF(PYTHONINTERP_FOUND)
find_python_module(pip REQUIRED) find_python_module(pip REQUIRED)
find_python_module(numpy REQUIRED) find_python_module(numpy REQUIRED)
......
...@@ -115,7 +115,7 @@ set(COMMON_FLAGS ...@@ -115,7 +115,7 @@ set(COMMON_FLAGS
-Wno-error=literal-suffix -Wno-error=literal-suffix
-Wno-error=sign-compare -Wno-error=sign-compare
-Wno-error=unused-local-typedefs -Wno-error=unused-local-typedefs
-Wno-error=parentheses-equality # Warnings in Pybind11 -Wno-error=parentheses-equality # Warnings in pybind11
) )
set(GPU_COMMON_FLAGS set(GPU_COMMON_FLAGS
...@@ -195,6 +195,7 @@ endif() ...@@ -195,6 +195,7 @@ endif()
# Modern gpu architectures: Pascal # Modern gpu architectures: Pascal
if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0") if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60") list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
endif() endif()
# Custom gpu architecture # Custom gpu architecture
......
...@@ -403,3 +403,16 @@ function(py_proto_compile TARGET_NAME) ...@@ -403,3 +403,16 @@ function(py_proto_compile TARGET_NAME)
protobuf_generate_python(py_srcs ${py_proto_compile_SRCS}) protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs}) add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
endfunction() endfunction()
function(py_test TARGET_NAME)
if(WITH_TESTING)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
python2 ${py_test_SRCS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endfunction()
...@@ -149,9 +149,12 @@ endfunction() ...@@ -149,9 +149,12 @@ endfunction()
# Create a python unittest using run_python_tests.sh, # Create a python unittest using run_python_tests.sh,
# which takes care of making correct running environment # which takes care of making correct running environment
function(add_python_test TEST_NAME) function(add_python_test TEST_NAME)
add_test(NAME ${TEST_NAME} foreach(arg ${ARGN})
COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR} get_filename_component(py_fn ${arg} NAME_WE)
bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh set(TRG_NAME ${TEST_NAME}_${py_fn})
${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN} add_test(NAME ${TRG_NAME}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
python2 ${arg}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endforeach()
endfunction() endfunction()
# Intel® MKL-DNN on PaddlePaddle: Design Doc
我们计划将Intel深度神经网络数学库(**MKL-DNN**\[[1](#references)\])集成到PaddlePaddle,充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。
我们短期内的基本目标是:
- 完成常用layer的MKL-DNN实现。
- 完成常见深度神经网络VGG,GoogLeNet 和 ResNet的MKL-DNN实现。
## Contents
- [Overview](#overview)
- [Actions](#actions)
- [CMake](#cmake)
- [Layers](#layers)
- [Activations](#activations)
- [Unit Tests](#unit-tests)
- [Protobuf Messages](#protobuf-messages)
- [Python API](#python-api)
- [Demos](#demos)
- [Benchmarking](#benchmarking)
- [Others](#others)
- [Design Concerns](#design-concerns)
## Overview
我们会把MKL-DNN作为第三方库集成进PaddlePaddle,整体框架图
<div align="center">
<img src="image/overview.png" width=350><br/>
Figure 1. PaddlePaddle on IA.
</div>
## Actions
我们把集成方案大致分为了如下几个方面。
### CMake
我们会在`CMakeLists.txt`中会添加`WITH_MKLDNN`的选项,当设置这个值为`ON`的时候会启用编译MKL-DNN功能。同时会自动开启OpenMP用于提高MKL-DNN的性能。
同时,我们会引入`WITH_MKLML`选项,用于选择是否使用MKL-DNN自带的MKLML安装包。这个安装包可以独立于MKL-DNN使用,但是建议在开启MKL-DNN的同时也打开MKLML的开关,这样才能发挥最好的性能。
所以,我们会在`cmake/external`目录新建`mkldnn.cmake``mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中。
**备注**:当`WITH_MKLML=ON`的时候,会优先使用这个包作为PaddlePaddle的CBLAS和LAPACK库,所以会稍微改动`cmake/cblas.cmake`中的逻辑。
### Layers
所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在
`paddle/gserver/layers`中,并且文件名都会一以*Mkldnn*开头。
所有MKL-DNN的layers都会继承于一个叫做`MkldnnLayer`的父类,该父类继承于PaddlePaddle的基类`Layer`
### Activations
由于在PaddlePaddle中,激活函数是独立于layer概念的,所以会在`paddle/gserver/activations`目录下添加一个`MkldnnActivation.h`文件定义一些用于MKL-DNN的接口,实现方法还是会在`ActivationFunction.cpp`文件。
### Unit Tests
会在`paddle/gserver/test`目录下添加`test_Mkldnn.cpp``MkldnnTester.*`用于MKL-DNN的测试。
Activation的测试,计划在PaddlePaddle原有的测试文件上直接添加新的测试type。
### Protobuf Messages
根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。
### Python API
目前只考虑**v1 API**
计划在`python/paddle/trainer/config_parser.py`里面添加`use_mkldnn`这个选择,方便用户选择使用MKL-DNN的layers。
具体实现方式比如:
```python
use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
if use_mkldnn
self.layer_type = mkldnn_*
```
所有MKL-DNN的layer type会以*mkldnn_*开头,以示区分。
并且可能在`python/paddle/trainer_config_helper`目录下的`activations.py ``layers.py`里面添加必要的MKL-DNN的接口。
### Demos
会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹,里面放入一些用于MKL-DNN测试的demo脚本。
### Benchmarking
会考虑添加部分逻辑在`benchmark/paddle/image/run.sh`,添加使用MKL-DNN的测试。
### Others
1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为64。
2. 深入PaddlePaddle,寻找有没有其他可以优化的可能,进一步优化。比如可能会用OpenMP改进SGD的更新性能。
## Design Concerns
为了更好的符合PaddlePaddle的代码风格\[[2](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[3](#references)\]
我们总结出一些特别需要注意的点:
1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数,我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2``MkldnnLayer`特有的设备ID。
2. 重写父类Layer的**init**函数,修改`deviceId_``-2`,代表这个layer是用于跑在MKL-DNN的环境下。
3. 创建`MkldnnMatrix`,用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。
4. 创建`MkldnnBase`,定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MkldnnStream``CpuEngine`,和未来可能还会用到`FPGAEngine`等。
5.**Argument**里添加两个`MkldnnMatrixPtr`,取名为`mkldnnValue``mkldnnGrad`,用于存放`MkldnnLayer`会用到的memory buffer。 并且添加函数cvt(会修改为一个更加合适的函数名),用于处理"CPU device"和"MKL-DNN device"之间memory的相互转化。
6. 在父类`Layer`中的`getOutput`函数中添加一段逻辑,用于判断`deviceId`,并针对device在MKL-DNN和CPU之间不统一的情况,做一个前期转换。 也就是调用`Argument`的cvt函数把output统一到需要的device上。
7. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。
## References
1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN")
2. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。
3. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的CUDNN部分使用的也是`NCHW`,所以不存在这个问题),所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。
...@@ -21,22 +21,15 @@ ...@@ -21,22 +21,15 @@
# #
# It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...} # It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...}
# #
PYPATH=""
if ! python -c "import paddle" >/dev/null 2>/dev/null; then set -x
PYPATH="" while getopts "d:" opt; do
set -x case $opt in
while getopts "d:" opt; do d)
case $opt in PYPATH=$OPTARG
d) ;;
PYPATH=$OPTARG esac
;; done
esac shift $(($OPTIND - 1))
done export PYTHONPATH=$PYPATH:$PYTHONPATH
shift $(($OPTIND - 1)) $@
export PYTHONPATH=$PYPATH:$PYTHONPATH
$@
else
echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."
echo "Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'"
exit 1
fi
add_python_test(test_swig_api py_test(testTrain SRCS testTrain.py)
testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py testTrainer.py) py_test(testMatrix SRCS testMatrix.py)
py_test(testVector SRCS testVector.py)
py_test(testTrainer SRCS testTrainer.py)
py_test(testArguments SRCS testArguments.py)
py_test(testGradientMachine SRCS testGradientMachine.py)
...@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_batch_transpose.h"
#include "hl_base.h" #include "hl_base.h"
#include "hl_batch_transpose.h"
const int TILE_DIM = 64; const int TILE_DIM = 64;
const int BLOCK_ROWS = 16; const int BLOCK_ROWS = 16;
// No bank-conflict transpose for a batch of data. // No bank-conflict transpose for a batch of data.
__global__ void batchTransposeNoBankConflicts(real* odata, __global__ void batchTransposeNoBankConflicts(
const real* idata, real* odata, const real* idata, int numSamples, int width, int height) {
int numSamples, int width,
int height) {
__shared__ float tile[TILE_DIM][TILE_DIM + 1]; __shared__ float tile[TILE_DIM][TILE_DIM + 1];
const int x = blockIdx.x * TILE_DIM + threadIdx.x; const int x = blockIdx.x * TILE_DIM + threadIdx.x;
...@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata, ...@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
newX] = tile[threadIdx.x][j]; newX] = tile[threadIdx.x][j];
} }
void batchTranspose(const real* input, real* output, int width, int height, void batchTranspose(
int batchSize) { const real* input, real* output, int width, int height, int batchSize) {
dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize); dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>> batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
(output, input, batchSize, width, height); output, input, batchSize, width, height);
CHECK_SYNC("batchTranspose failed!"); CHECK_SYNC("batchTranspose failed!");
} }
...@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_aggregate.h"
#include "hl_base.h" #include "hl_base.h"
#include "hl_cuda.h" #include "hl_cuda.h"
#include "hl_cuda.ph" #include "hl_cuda.ph"
#include "hl_aggregate.h"
#include "hl_thread.ph"
#include "hl_matrix_base.cuh" #include "hl_matrix_base.cuh"
#include "hl_thread.ph"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
/** /**
* @brief matrix row operator. * @brief matrix row operator.
*/ */
template<class Agg, int blockSize> template <class Agg, int blockSize>
__global__ void KeMatrixRowOp(Agg agg, __global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
real *E,
real *Sum,
int dimN) {
__shared__ real sum_s[blockSize]; __shared__ real sum_s[blockSize];
int cnt = (dimN + blockSize -1) / blockSize; int cnt = (dimN + blockSize - 1) / blockSize;
int rowId = blockIdx.x + blockIdx.y*gridDim.x; int rowId = blockIdx.x + blockIdx.y * gridDim.x;
int index = rowId*dimN; int index = rowId * dimN;
int tid = threadIdx.x; int tid = threadIdx.x;
int lmt = tid; int lmt = tid;
...@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg, ...@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg,
sum_s[tid] = tmp; sum_s[tid] = tmp;
__syncthreads(); __syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) { for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) { if (tid < stride) {
sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]); sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
} }
...@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg, ...@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg,
} }
template <class Agg> template <class Agg>
void hl_matrix_row_op(Agg agg, void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
real *A_d,
real *C_d,
int dimM,
int dimN) {
int blocksX = dimM; int blocksX = dimM;
int blocksY = 1; int blocksY = 1;
dim3 threads(128, 1); dim3 threads(128, 1);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixRowOp<Agg, 128><<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
(agg, A_d, C_d, dimN); agg, A_d, C_d, dimN);
} }
void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) { void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::sum(), hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_row_sum failed"); CHECK_SYNC("hl_matrix_row_sum failed");
} }
...@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::max(), hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_row_max failed"); CHECK_SYNC("hl_matrix_row_max failed");
} }
...@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::min(), hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_row_min failed"); CHECK_SYNC("hl_matrix_row_min failed");
} }
/** /**
* @brief matrix column operator. * @brief matrix column operator.
*/ */
template<class Agg> template <class Agg>
__global__ void KeMatrixColumnOp(Agg agg, __global__ void KeMatrixColumnOp(
real *E, Agg agg, real *E, real *Sum, int dimM, int dimN) {
real *Sum,
int dimM,
int dimN) {
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
real tmp = agg.init(); real tmp = agg.init();
if (rowIdx < dimN) { if (rowIdx < dimN) {
...@@ -127,15 +104,12 @@ __global__ void KeMatrixColumnOp(Agg agg, ...@@ -127,15 +104,12 @@ __global__ void KeMatrixColumnOp(Agg agg,
} }
} }
template<class Agg, int blockDimX, int blockDimY> template <class Agg, int blockDimX, int blockDimY>
__global__ void KeMatrixColumnOp_S(Agg agg, __global__ void KeMatrixColumnOp_S(
real *E, Agg agg, real *E, real *Sum, int dimM, int dimN) {
real *Sum, __shared__ real _sum[blockDimX * blockDimY];
int dimM, int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int dimN) { int index = threadIdx.y;
__shared__ real _sum[blockDimX*blockDimY];
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int index = threadIdx.y;
real tmp = agg.init(); real tmp = agg.init();
if (rowIdx < dimN) { if (rowIdx < dimN) {
...@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg, ...@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
index += blockDimY; index += blockDimY;
} }
} }
_sum[threadIdx.x + threadIdx.y*blockDimX] = tmp; _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
__syncthreads(); __syncthreads();
if (rowIdx < dimN) { if (rowIdx < dimN) {
if (threadIdx.y ==0) { if (threadIdx.y == 0) {
real tmp = agg.init(); real tmp = agg.init();
for (int i=0; i < blockDimY; i++) { for (int i = 0; i < blockDimY; i++) {
tmp = agg(tmp, _sum[threadIdx.x + i*blockDimX]); tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
} }
Sum[rowIdx] = tmp; Sum[rowIdx] = tmp;
} }
...@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg, ...@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
} }
template <class Agg> template <class Agg>
void hl_matrix_column_op(Agg agg, void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
real *A_d,
real *C_d,
int dimM,
int dimN) {
if (dimN >= 8192) { if (dimN >= 8192) {
int blocksX = (dimN + 128 -1) / 128; int blocksX = (dimN + 128 - 1) / 128;
int blocksY = 1; int blocksY = 1;
dim3 threads(128, 1); dim3 threads(128, 1);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixColumnOp<Agg><<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
(agg, A_d, C_d, dimM, dimN); agg, A_d, C_d, dimM, dimN);
} else { } else {
int blocksX = (dimN + 32 -1) / 32; int blocksX = (dimN + 32 - 1) / 32;
int blocksY = 1; int blocksY = 1;
dim3 threads(32, 32); dim3 threads(32, 32);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixColumnOp_S<Agg, 32, 32><<< grid, threads, 0, STREAM_DEFAULT>>> KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
(agg, A_d, C_d, dimM, dimN); agg, A_d, C_d, dimM, dimN);
} }
return; return;
...@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::sum(), hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_column_sum failed"); CHECK_SYNC("hl_matrix_column_sum failed");
} }
...@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::max(), hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_column_max failed"); CHECK_SYNC("hl_matrix_column_max failed");
} }
...@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::min(), hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_column_min failed"); CHECK_SYNC("hl_matrix_column_min failed");
} }
...@@ -226,16 +184,16 @@ template <int blockSize> ...@@ -226,16 +184,16 @@ template <int blockSize>
__global__ void KeVectorSum(real *E, real *Sum, int dimM) { __global__ void KeVectorSum(real *E, real *Sum, int dimM) {
__shared__ double sum_s[blockSize]; __shared__ double sum_s[blockSize];
int tid = threadIdx.x; int tid = threadIdx.x;
int index = blockIdx.y*blockDim.x+threadIdx.x; int index = blockIdx.y * blockDim.x + threadIdx.x;
sum_s[tid] = 0.0f; sum_s[tid] = 0.0f;
while (index < dimM) { while (index < dimM) {
sum_s[tid] += E[index]; sum_s[tid] += E[index];
index += blockDim.x*gridDim.y; index += blockDim.x * gridDim.y;
} }
__syncthreads(); __syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) { for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) { if (tid < stride) {
sum_s[tid] += sum_s[tid + stride]; sum_s[tid] += sum_s[tid + stride];
} }
...@@ -259,38 +217,39 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) { ...@@ -259,38 +217,39 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
dim3 threads(blockSize, 1); dim3 threads(blockSize, 1);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
hl_event_t hl_event = &hl_event_st; hl_event_t hl_event = &hl_event_st;
while (!hl_cuda_event_is_ready(hl_event)) {} while (!hl_cuda_event_is_ready(hl_event)) {
}
KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
(A_d, t_resource.gpu_mem, dimM); A_d, t_resource.gpu_mem, dimM);
KeVectorSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
(t_resource.gpu_mem, t_resource.cpu_mem, 128); t_resource.gpu_mem, t_resource.cpu_mem, 128);
hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
hl_stream_synchronize(HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT);
cudaError_t err = (cudaError_t)hl_get_device_last_error(); cudaError_t err = (cudaError_t)hl_get_device_last_error();
CHECK_EQ(cudaSuccess, err) CHECK_EQ(cudaSuccess, err) << "CUDA error: "
<< "CUDA error: " << hl_get_device_error_string((size_t)err); << hl_get_device_error_string((size_t)err);
} }
template <int blockSize> template <int blockSize>
__global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) { __global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
__shared__ double sum_s[blockSize]; __shared__ double sum_s[blockSize];
int tid = threadIdx.x; int tid = threadIdx.x;
int index = blockIdx.y*blockDim.x+threadIdx.x; int index = blockIdx.y * blockDim.x + threadIdx.x;
sum_s[tid] = 0.0f; sum_s[tid] = 0.0f;
while (index < dimM) { while (index < dimM) {
sum_s[tid] += abs(E[index]); sum_s[tid] += abs(E[index]);
index += blockDim.x*gridDim.y; index += blockDim.x * gridDim.y;
} }
__syncthreads(); __syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) { for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) { if (tid < stride) {
sum_s[tid] += sum_s[tid + stride]; sum_s[tid] += sum_s[tid + stride];
} }
...@@ -314,20 +273,21 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) { ...@@ -314,20 +273,21 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
dim3 threads(blockSize, 1); dim3 threads(blockSize, 1);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
hl_event_t hl_event = &hl_event_st; hl_event_t hl_event = &hl_event_st;
while (!hl_cuda_event_is_ready(hl_event)) {} while (!hl_cuda_event_is_ready(hl_event)) {
}
KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
(A_d, t_resource.gpu_mem, dimM); A_d, t_resource.gpu_mem, dimM);
KeVectorAbsSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
(t_resource.gpu_mem, t_resource.cpu_mem, 128); t_resource.gpu_mem, t_resource.cpu_mem, 128);
hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
hl_stream_synchronize(HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT);
cudaError_t err = (cudaError_t)hl_get_device_last_error(); cudaError_t err = (cudaError_t)hl_get_device_last_error();
CHECK_EQ(cudaSuccess, err) CHECK_EQ(cudaSuccess, err) << "CUDA error: "
<< "CUDA error: " << hl_get_device_error_string((size_t)err); << hl_get_device_error_string((size_t)err);
} }
此差异已折叠。
此差异已折叠。
此差异已折叠。
...@@ -16,36 +16,36 @@ limitations under the License. */ ...@@ -16,36 +16,36 @@ limitations under the License. */
#include "hl_device_functions.cuh" #include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
__global__ void KeMaxSequenceForward(real *input, __global__ void KeMaxSequenceForward(real* input,
const int *sequence, const int* sequence,
real* output, real* output,
int *index, int* index,
int numSequences, int numSequences,
int dim) { int dim) {
int dimIdx = threadIdx.x; int dimIdx = threadIdx.x;
int sequenceId = blockIdx.x; int sequenceId = blockIdx.x;
if (sequenceId >= numSequences) return; if (sequenceId >= numSequences) return;
int start = sequence[sequenceId]; int start = sequence[sequenceId];
int end = sequence[sequenceId+1]; int end = sequence[sequenceId + 1];
for (int i = dimIdx; i < dim; i += blockDim.x) { for (int i = dimIdx; i < dim; i += blockDim.x) {
real tmp = -HL_FLOAT_MAX; real tmp = -HL_FLOAT_MAX;
int tmpId = -1; int tmpId = -1;
for (int insId = start; insId < end; insId++) { for (int insId = start; insId < end; insId++) {
if (tmp < input[insId*dim + i]) { if (tmp < input[insId * dim + i]) {
tmp = input[insId*dim + i]; tmp = input[insId * dim + i];
tmpId = insId; tmpId = insId;
} }
} }
output[sequenceId*dim + i] = tmp; output[sequenceId * dim + i] = tmp;
index[sequenceId*dim + i] = tmpId; index[sequenceId * dim + i] = tmpId;
} }
} }
void hl_max_sequence_forward(real* input, void hl_max_sequence_forward(real* input,
const int* sequence, const int* sequence,
real* output, real* output,
int *index, int* index,
int numSequences, int numSequences,
int dim) { int dim) {
CHECK_NOTNULL(input); CHECK_NOTNULL(input);
...@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input, ...@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input,
dim3 threads(256, 1); dim3 threads(256, 1);
dim3 grid(numSequences, 1); dim3 grid(numSequences, 1);
KeMaxSequenceForward<<< grid, threads, 0, STREAM_DEFAULT >>> KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
(input, sequence, output, index, numSequences, dim); input, sequence, output, index, numSequences, dim);
CHECK_SYNC("hl_max_sequence_forward failed"); CHECK_SYNC("hl_max_sequence_forward failed");
} }
__global__ void KeMaxSequenceBackward(real *outputGrad, __global__ void KeMaxSequenceBackward(
int *index, real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
real* inputGrad,
int numSequences,
int dim) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
int colIdx = idx % dim; int colIdx = idx % dim;
if (idx < numSequences*dim) { if (idx < numSequences * dim) {
int insId = index[idx]; int insId = index[idx];
inputGrad[insId * dim + colIdx] += outputGrad[idx]; inputGrad[insId * dim + colIdx] += outputGrad[idx];
} }
} }
void hl_max_sequence_backward(real* outputGrad, void hl_max_sequence_backward(
int *index, real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
real* inputGrad,
int numSequences,
int dim) {
CHECK_NOTNULL(outputGrad); CHECK_NOTNULL(outputGrad);
CHECK_NOTNULL(index); CHECK_NOTNULL(index);
CHECK_NOTNULL(inputGrad); CHECK_NOTNULL(inputGrad);
...@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad, ...@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad,
unsigned int blocks = (numSequences * dim + 128 - 1) / 128; unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
dim3 threads(128, 1); dim3 threads(128, 1);
dim3 grid(blocks, 1); dim3 grid(blocks, 1);
KeMaxSequenceBackward<<< grid, threads, 0, STREAM_DEFAULT >>> KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
(outputGrad, index, inputGrad, numSequences, dim); outputGrad, index, inputGrad, numSequences, dim);
CHECK_SYNC("hl_max_sequence_backward failed"); CHECK_SYNC("hl_max_sequence_backward failed");
} }
template<int blockDimX, int blockDimY, int gridDimX, bool AddRow> template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output, __global__ void KeMatrixAddRows(real* output,
real* table, real* table,
int* ids, int* ids,
...@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output, ...@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output,
while (sampleId < numSamples) { while (sampleId < numSamples) {
int tableId = ids[sampleId]; int tableId = ids[sampleId];
if ((0 <= tableId) && (tableId < tableSize)) { if ((0 <= tableId) && (tableId < tableSize)) {
real *outputData = output + sampleId * dim; real* outputData = output + sampleId * dim;
real *tableData = table + tableId * dim; real* tableData = table + tableId * dim;
for (int i = idx; i < dim; i += blockDimX) { for (int i = idx; i < dim; i += blockDimX) {
if (AddRow == 0) { if (AddRow == 0) {
outputData[i] += tableData[i]; outputData[i] += tableData[i];
...@@ -114,24 +108,27 @@ __global__ void KeMatrixAddRows(real* output, ...@@ -114,24 +108,27 @@ __global__ void KeMatrixAddRows(real* output,
} }
} }
} }
sampleId += blockDimY*gridDimX; sampleId += blockDimY * gridDimX;
} }
} }
template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd> template <int blockDimX,
__global__ int blockDimY,
void KeSequence2Batch(real *batch, int gridDimX,
real *sequence, bool seq2batch,
const int *batchIndex, bool isAdd>
int seqWidth, __global__ void KeSequence2Batch(real* batch,
int batchCount) { real* sequence,
const int* batchIndex,
int seqWidth,
int batchCount) {
int idx = threadIdx.x; int idx = threadIdx.x;
int idy = threadIdx.y; int idy = threadIdx.y;
int id = blockIdx.x + idy * gridDimX; int id = blockIdx.x + idy * gridDimX;
while (id < batchCount) { while (id < batchCount) {
int seqId = batchIndex[id]; int seqId = batchIndex[id];
real* batchData = batch + id*seqWidth; real* batchData = batch + id * seqWidth;
real* seqData = sequence + seqId*seqWidth; real* seqData = sequence + seqId * seqWidth;
for (int i = idx; i < seqWidth; i += blockDimX) { for (int i = idx; i < seqWidth; i += blockDimX) {
if (seq2batch) { if (seq2batch) {
if (isAdd) { if (isAdd) {
...@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch, ...@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch,
} }
} }
} }
id += blockDimY*gridDimX; id += blockDimY * gridDimX;
} }
} }
void hl_sequence2batch_copy(real *batch, void hl_sequence2batch_copy(real* batch,
real *sequence, real* sequence,
const int *batchIndex, const int* batchIndex,
int seqWidth, int seqWidth,
int batchCount, int batchCount,
bool seq2batch) { bool seq2batch) {
...@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch, ...@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch,
dim3 threads(128, 8); dim3 threads(128, 8);
dim3 grid(8, 1); dim3 grid(8, 1);
if (seq2batch) { if (seq2batch) {
KeSequence2Batch<128, 8, 8, 1, 0><<< grid, threads, 0, STREAM_DEFAULT >>> KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
(batch, sequence, batchIndex, seqWidth, batchCount); batch, sequence, batchIndex, seqWidth, batchCount);
} else { } else {
KeSequence2Batch<128, 8, 8, 0, 0><<< grid, threads, 0, STREAM_DEFAULT >>> KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
(batch, sequence, batchIndex, seqWidth, batchCount); batch, sequence, batchIndex, seqWidth, batchCount);
} }
CHECK_SYNC("hl_sequence2batch_copy failed"); CHECK_SYNC("hl_sequence2batch_copy failed");
} }
void hl_sequence2batch_add(real *batch, void hl_sequence2batch_add(real* batch,
real *sequence, real* sequence,
int *batchIndex, int* batchIndex,
int seqWidth, int seqWidth,
int batchCount, int batchCount,
bool seq2batch) { bool seq2batch) {
...@@ -186,23 +183,22 @@ void hl_sequence2batch_add(real *batch, ...@@ -186,23 +183,22 @@ void hl_sequence2batch_add(real *batch,
dim3 threads(128, 8); dim3 threads(128, 8);
dim3 grid(8, 1); dim3 grid(8, 1);
if (seq2batch) { if (seq2batch) {
KeSequence2Batch<128, 8, 8, 1, 1><<< grid, threads, 0, STREAM_DEFAULT >>> KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
(batch, sequence, batchIndex, seqWidth, batchCount); batch, sequence, batchIndex, seqWidth, batchCount);
} else { } else {
KeSequence2Batch<128, 8, 8, 0, 1><<< grid, threads, 0, STREAM_DEFAULT >>> KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
(batch, sequence, batchIndex, seqWidth, batchCount); batch, sequence, batchIndex, seqWidth, batchCount);
} }
CHECK_SYNC("hl_sequence2batch_add failed"); CHECK_SYNC("hl_sequence2batch_add failed");
} }
template<bool normByTimes, bool seq2batch> template <bool normByTimes, bool seq2batch>
__global__ __global__ void KeSequence2BatchPadding(real* batch,
void KeSequence2BatchPadding(real* batch, real* sequence,
real* sequence, const int* sequenceStartPositions,
const int* sequenceStartPositions, const size_t sequenceWidth,
const size_t sequenceWidth, const size_t maxSequenceLength,
const size_t maxSequenceLength, const size_t numSequences) {
const size_t numSequences) {
int batchIdx = blockIdx.y; int batchIdx = blockIdx.y;
int sequenceStart = sequenceStartPositions[batchIdx]; int sequenceStart = sequenceStartPositions[batchIdx];
int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart; int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart;
...@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch, ...@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch,
if (seq2batch) { if (seq2batch) {
/* sequence -> batch */ /* sequence -> batch */
if (normByTimes) { if (normByTimes) {
KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, sequenceStartPositions, batch,
sequenceWidth, maxSequenceLength, numSequences); sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} else { } else {
KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, sequenceStartPositions, batch,
sequenceWidth, maxSequenceLength, numSequences); sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} }
} else { } else {
/* batch -> sequence */ /* batch -> sequence */
if (normByTimes) { if (normByTimes) {
KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, sequenceStartPositions, batch,
sequenceWidth, maxSequenceLength, numSequences); sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} else { } else {
KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, sequenceStartPositions, batch,
sequenceWidth, maxSequenceLength, numSequences); sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} }
} }
CHECK_SYNC("hl_sequence2batch_copy_padding failed"); CHECK_SYNC("hl_sequence2batch_copy_padding failed");
} }
__device__ inline float my_rsqrt(float x) { __device__ inline float my_rsqrt(float x) { return rsqrtf(x); }
return rsqrtf(x);
}
__device__ inline double my_rsqrt(double x) { __device__ inline double my_rsqrt(double x) { return rsqrt(x); }
return rsqrt(x);
}
__global__ void KeSequenceAvgForward(real* dst, __global__ void KeSequenceAvgForward(real* dst,
real* src, real* src,
...@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst, ...@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst,
for (int i = start; i < end; i++) { for (int i = start; i < end; i++) {
sum += src[i * width + col]; sum += src[i * width + col];
} }
sum = mode == 1 ? sum : sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
(mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength)); : sum * my_rsqrt((real)seqLength));
dst[gid] += sum; dst[gid] += sum;
} }
} }
...@@ -347,10 +355,10 @@ void hl_sequence_avg_forward(real* dst, ...@@ -347,10 +355,10 @@ void hl_sequence_avg_forward(real* dst,
int grid = DIVUP(width * height, 512); int grid = DIVUP(width * height, 512);
CHECK(mode == 0 || mode == 1 || mode == 2) CHECK(mode == 0 || mode == 1 || mode == 2)
<< "mode error in hl_sequence_avg_forward!"; << "mode error in hl_sequence_avg_forward!";
KeSequenceAvgForward<<< grid, block, 0, STREAM_DEFAULT >>> KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
(dst, src, starts, height, width, mode); dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_forward failed"); CHECK_SYNC("hl_sequence_avg_forward failed");
} }
...@@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst, ...@@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst,
int seqLength = end - start; int seqLength = end - start;
if (seqLength == 0) return; if (seqLength == 0) return;
real grad = src[gid]; real grad = src[gid];
grad = mode == 1 ? grad : grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
(mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength)); : grad * my_rsqrt((real)seqLength));
for (int i = start; i < end; i++) { for (int i = start; i < end; i++) {
dst[i * width + col] += grad; dst[i * width + col] += grad;
} }
...@@ -392,9 +400,9 @@ void hl_sequence_avg_backward(real* dst, ...@@ -392,9 +400,9 @@ void hl_sequence_avg_backward(real* dst,
int grid = DIVUP(width * height, 512); int grid = DIVUP(width * height, 512);
CHECK(mode == 0 || mode == 1 || mode == 2) CHECK(mode == 0 || mode == 1 || mode == 2)
<< "mode error in hl_sequence_avg_backward!"; << "mode error in hl_sequence_avg_backward!";
KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>> KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
(dst, src, starts, height, width, mode); dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_backward failed"); CHECK_SYNC("hl_sequence_avg_backward failed");
} }
此差异已折叠。
...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cmath>
#include <stdlib.h> #include <stdlib.h>
#include "hl_cuda.h" #include <cmath>
#include "hl_time.h"
#include "hl_base.h" #include "hl_base.h"
#include "hl_cuda.h"
#include "hl_perturbation_util.cuh" #include "hl_perturbation_util.cuh"
#include "hl_time.h"
#define _USE_MATH_DEFINES #define _USE_MATH_DEFINES
...@@ -30,10 +29,16 @@ limitations under the License. */ ...@@ -30,10 +29,16 @@ limitations under the License. */
* centerX, centerY: translation. * centerX, centerY: translation.
* sourceX, sourceY: output coordinates in the original image. * sourceX, sourceY: output coordinates in the original image.
*/ */
__device__ void getTranformCoord(int x, int y, real theta, real scale, __device__ void getTranformCoord(int x,
real tgtCenter, real imgCenter, int y,
real centerR, real centerC, real theta,
int* sourceX, int* sourceY) { real scale,
real tgtCenter,
real imgCenter,
real centerR,
real centerC,
int* sourceX,
int* sourceY) {
real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)}; real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};
// compute coornidates in the rotated and scaled image // compute coornidates in the rotated and scaled image
...@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale, ...@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
* created by Wei Xu (genome), converted by Jiang Wang * created by Wei Xu (genome), converted by Jiang Wang
*/ */
__global__ void kSamplingPatches(const real* imgs, real* targets, __global__ void kSamplingPatches(const real* imgs,
int imgSize, int tgtSize, const int channels, real* targets,
int samplingRate, const real* thetas, int imgSize,
const real* scales, const int* centerRs, int tgtSize,
const int* centerCs, const real padValue, const int channels,
int samplingRate,
const real* thetas,
const real* scales,
const int* centerRs,
const int* centerCs,
const real padValue,
const int numImages) { const int numImages) {
const int caseIdx = blockIdx.x * 4 + threadIdx.x; const int caseIdx = blockIdx.x * 4 + threadIdx.x;
const int pxIdx = blockIdx.y * 128 + threadIdx.y; const int pxIdx = blockIdx.y * 128 + threadIdx.y;
...@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets, ...@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
const int pxY = pxIdx / tgtSize; const int pxY = pxIdx / tgtSize;
int srcPxX, srcPxY; int srcPxX, srcPxY;
getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter, getTranformCoord(pxX,
imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX, pxY,
thetas[imgIdx],
scales[imgIdx],
tgtCenter,
imgCenter,
centerCs[caseIdx],
centerRs[caseIdx],
&srcPxX,
&srcPxY); &srcPxY);
imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels; imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
...@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets, ...@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
* *
* created by Wei Xu * created by Wei Xu
*/ */
void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, void hl_generate_disturb_params(real*& gpuAngle,
int*& gpuCenterR, int*& gpuCenterC, real*& gpuScaleRatio,
int numImages, int imgSize, real rotateAngle, int*& gpuCenterR,
real scaleRatio, int samplingRate, int*& gpuCenterC,
int numImages,
int imgSize,
real rotateAngle,
real scaleRatio,
int samplingRate,
bool isTrain) { bool isTrain) {
// The number of output samples. // The number of output samples.
int numPatches = numImages * samplingRate; int numPatches = numImages * samplingRate;
...@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, ...@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
for (int i = 0; i < numImages; i++) { for (int i = 0; i < numImages; i++) {
r_angle[i] = r_angle[i] =
(rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0) // NOLINT (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0) // NOLINT
- 0.5); -
0.5);
s_ratio[i] = s_ratio[i] =
1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio; // NOLINT 1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio; // NOLINT
} }
...@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, ...@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
int pxY = int pxY =
(int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0)); // NOLINT (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0)); // NOLINT
const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]), const real H[4] = {cos(-r_angle[i]),
sin(-r_angle[i]), cos(-r_angle[i])}; -sin(-r_angle[i]),
sin(-r_angle[i]),
cos(-r_angle[i])};
real x = pxX - imgCenter; real x = pxX - imgCenter;
real y = pxY - imgCenter; real y = pxY - imgCenter;
real xx = H[0] * x + H[1] * y; real xx = H[0] * x + H[1] * y;
...@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, ...@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
delete[] center_c; delete[] center_c;
} }
void hl_conv_random_disturb_with_params(const real* images, int imgSize, void hl_conv_random_disturb_with_params(const real* images,
int tgtSize, int channels, int imgSize,
int numImages, int samplingRate, int tgtSize,
int channels,
int numImages,
int samplingRate,
const real* gpuRotationAngle, const real* gpuRotationAngle,
const real* gpuScaleRatio, const real* gpuScaleRatio,
const int* gpuCenterR, const int* gpuCenterR,
...@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize, ...@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
dim3 threadsPerBlock(4, 128); dim3 threadsPerBlock(4, 128);
dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128)); dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));
kSamplingPatches <<<numBlocks, threadsPerBlock>>> kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
(images, target, imgSize, tgtSize, channels, samplingRate, target,
gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC, imgSize,
paddingValue, numImages); tgtSize,
channels,
samplingRate,
gpuRotationAngle,
gpuScaleRatio,
gpuCenterR,
gpuCenterC,
paddingValue,
numImages);
hl_device_synchronize(); hl_device_synchronize();
} }
void hl_conv_random_disturb(const real* images, int imgSize, void hl_conv_random_disturb(const real* images,
int tgtSize, int channels, int numImages, int imgSize,
real scaleRatio, real rotateAngle, int tgtSize,
int samplingRate, real* gpu_r_angle, int channels,
real* gpu_s_ratio, int* gpu_center_r, int numImages,
int* gpu_center_c, int paddingValue, real scaleRatio,
bool isTrain, real* targets) { real rotateAngle,
int samplingRate,
real* gpu_r_angle,
real* gpu_s_ratio,
int* gpu_center_r,
int* gpu_center_c,
int paddingValue,
bool isTrain,
real* targets) {
// generate the random disturbance sequence and the sampling locations // generate the random disturbance sequence and the sampling locations
hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r, hl_generate_disturb_params(gpu_r_angle,
gpu_center_c, numImages, imgSize, rotateAngle, gpu_s_ratio,
scaleRatio, samplingRate, isTrain); gpu_center_r,
gpu_center_c,
hl_conv_random_disturb_with_params( numImages,
images, imgSize, tgtSize, channels, numImages, imgSize,
samplingRate, gpu_r_angle, gpu_s_ratio, rotateAngle,
gpu_center_r, gpu_center_r, paddingValue, scaleRatio,
targets); samplingRate,
isTrain);
hl_conv_random_disturb_with_params(images,
imgSize,
tgtSize,
channels,
numImages,
samplingRate,
gpu_r_angle,
gpu_s_ratio,
gpu_center_r,
gpu_center_r,
paddingValue,
targets);
} }
...@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_base.h" #include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_cuda.h" #include "hl_cuda.h"
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
template<int blockDimX, int blockDimY, int gridDimX, bool AddRow> template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output, int ldo, __global__ void KeMatrixAddRows(real* output,
real* table, int ldt, int ldo,
real* table,
int ldt,
int* ids, int* ids,
int numSamples, int numSamples,
int tableSize, int tableSize,
...@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo, ...@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
while (idy < numSamples) { while (idy < numSamples) {
int tableId = ids[idy]; int tableId = ids[idy];
if ((0 <= tableId) && (tableId < tableSize)) { if ((0 <= tableId) && (tableId < tableSize)) {
real *out = output + idy * ldo; real* out = output + idy * ldo;
real *tab = table + tableId * ldt; real* tab = table + tableId * ldt;
for (int i = idx; i < dim; i += blockDimX) { for (int i = idx; i < dim; i += blockDimX) {
if (AddRow) { if (AddRow) {
paddle::paddleAtomicAdd(&tab[i], out[i]); paddle::paddleAtomicAdd(&tab[i], out[i]);
...@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo, ...@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
} }
} }
void hl_matrix_select_rows(real* output, int ldo, void hl_matrix_select_rows(real* output,
real* table, int ldt, int ldo,
real* table,
int ldt,
int* ids, int* ids,
int numSamples, int numSamples,
int tableSize, int tableSize,
...@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo, ...@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
dim3 threads(128, 8); dim3 threads(128, 8);
dim3 grid(8, 1); dim3 grid(8, 1);
KeMatrixAddRows<128, 8, 8, 0><<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
(output, ldo, table, ldt, ids, numSamples, tableSize, dim); output, ldo, table, ldt, ids, numSamples, tableSize, dim);
CHECK_SYNC("hl_matrix_select_rows failed"); CHECK_SYNC("hl_matrix_select_rows failed");
} }
void hl_matrix_add_to_rows(real* table, int ldt, void hl_matrix_add_to_rows(real* table,
real* input, int ldi, int ldt,
real* input,
int ldi,
int* ids, int* ids,
int numSamples, int numSamples,
int tableSize, int tableSize,
...@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt, ...@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
dim3 threads(128, 8); dim3 threads(128, 8);
dim3 grid(8, 1); dim3 grid(8, 1);
KeMatrixAddRows<128, 8, 8, 1><<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
(input, ldi, table, ldt, ids, numSamples, tableSize, dim); input, ldi, table, ldt, ids, numSamples, tableSize, dim);
CHECK_SYNC("hl_matrix_add_to_rows failed"); CHECK_SYNC("hl_matrix_add_to_rows failed");
} }
template<class T, int blockDimX, int gridDimX> template <class T, int blockDimX, int gridDimX>
__global__ void KeVectorSelect(T* dst, int sized, __global__ void KeVectorSelect(
const T* src, int sizes, T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
const int* ids, int sizei) {
int idx = threadIdx.x + blockDimX * blockIdx.x; int idx = threadIdx.x + blockDimX * blockIdx.x;
while (idx < sizei) { while (idx < sizei) {
int index = ids[idx]; int index = ids[idx];
...@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized, ...@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
} }
template <class T> template <class T>
void hl_vector_select_from(T* dst, int sized, void hl_vector_select_from(
const T* src, int sizes, T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
const int* ids, int sizei) {
CHECK_NOTNULL(dst); CHECK_NOTNULL(dst);
CHECK_NOTNULL(src); CHECK_NOTNULL(src);
CHECK_NOTNULL(ids); CHECK_NOTNULL(ids);
...@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized, ...@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
dim3 threads(512, 1); dim3 threads(512, 1);
dim3 grid(8, 1); dim3 grid(8, 1);
KeVectorSelect<T, 512, 8><<< grid, threads, 0, STREAM_DEFAULT >>> KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
(dst, sized, src, sizes, ids, sizei); dst, sized, src, sizes, ids, sizei);
CHECK_SYNC("hl_vector_select_from failed"); CHECK_SYNC("hl_vector_select_from failed");
} }
template template void hl_vector_select_from(real* dst,
void hl_vector_select_from(real* dst, int sized, int sized,
const real* src, int sizes, const real* src,
const int* ids, int sizei); int sizes,
template const int* ids,
void hl_vector_select_from(int* dst, int sized, int sizei);
const int* src, int sizes, template void hl_vector_select_from(
const int* ids, int sizei); int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);
此差异已折叠。
...@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
syntax="proto2"; syntax = "proto2";
package paddle.framework; package paddle.framework;
// Attribute Type for paddle's Op. // Attribute Type for paddle's Op.
// Op contains many attributes. Each type of attributes could be different. // Op contains many attributes. Each type of attributes could be different.
// The AttrType will be shared between AttrDesc and AttrProto. // The AttrType will be shared between AttrDesc and AttrProto.
enum AttrType { enum AttrType {
INT = 0; INT = 0;
FLOAT = 1; FLOAT = 1;
STRING = 2; STRING = 2;
INTS = 3; INTS = 3;
FLOATS = 4; FLOATS = 4;
STRINGS = 5; STRINGS = 5;
} }
\ No newline at end of file
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
syntax="proto2"; syntax = "proto2";
package paddle.framework; package paddle.framework;
import "attribute.proto"; import "attribute.proto";
...@@ -22,14 +22,14 @@ import "attribute.proto"; ...@@ -22,14 +22,14 @@ import "attribute.proto";
// //
// e.g, for scale=3.0: name=scala, type=AttrType.FLOAT, value=3.0 // e.g, for scale=3.0: name=scala, type=AttrType.FLOAT, value=3.0
message AttrDesc { message AttrDesc {
required string name = 1; required string name = 1;
required AttrType type = 2; required AttrType type = 2;
optional int32 i = 3; optional int32 i = 3;
optional float f = 4; optional float f = 4;
optional string s = 5; optional string s = 5;
repeated int32 ints = 6; repeated int32 ints = 6;
repeated float floats = 7; repeated float floats = 7;
repeated string strings = 8; repeated string strings = 8;
}; };
// Protocol Message to describe an Operator. // Protocol Message to describe an Operator.
...@@ -42,15 +42,15 @@ message AttrDesc { ...@@ -42,15 +42,15 @@ message AttrDesc {
// 3rd-party language can build this proto message and call // 3rd-party language can build this proto message and call
// AddOp(const OpDesc& op_desc) of Paddle core to create an Operator. // AddOp(const OpDesc& op_desc) of Paddle core to create an Operator.
message OpDesc { message OpDesc {
// input names of this Operator. // input names of this Operator.
repeated string inputs = 1; repeated string inputs = 1;
// output names of this Operator. // output names of this Operator.
repeated string outputs = 2; repeated string outputs = 2;
// type of this Operator, such as "add", "sub", "fc". // type of this Operator, such as "add", "sub", "fc".
required string type = 3; required string type = 3;
// Attributes of this Operator. e.g., scale=3.0 in cosine op. // Attributes of this Operator. e.g., scale=3.0 in cosine op.
repeated AttrDesc attrs = 4; repeated AttrDesc attrs = 4;
}; };
\ No newline at end of file
...@@ -15,10 +15,11 @@ limitations under the License. */ ...@@ -15,10 +15,11 @@ limitations under the License. */
// Protocol Message for 3rd-party language binding. // Protocol Message for 3rd-party language binding.
// //
// Paddle Python package will use `OpProto` to generate op creation methods. // Paddle Python package will use `OpProto` to generate op creation methods.
// The op creation methods take user's input and generate `OpDesc` proto message, // The op creation methods take user's input and generate `OpDesc` proto
// message,
// then pass `OpDesc` to C++ side and create Op pointer. // then pass `OpDesc` to C++ side and create Op pointer.
// //
syntax="proto2"; syntax = "proto2";
package paddle.framework; package paddle.framework;
import "attribute.proto"; import "attribute.proto";
...@@ -26,89 +27,90 @@ import "attribute.proto"; ...@@ -26,89 +27,90 @@ import "attribute.proto";
// Attribute protocol message for 3rd-party language binding. // Attribute protocol message for 3rd-party language binding.
// It will store the Op support what attribute and what type. // It will store the Op support what attribute and what type.
message AttrProto { message AttrProto {
// Supported attribute name. e.g. `scale` for cosine op. // Supported attribute name. e.g. `scale` for cosine op.
required string name = 1; required string name = 1;
// Supported attribute type. // Supported attribute type.
required AttrType type = 2; required AttrType type = 2;
// Supported attribute comments. It helps 3rd-party language generate doc-string. // Supported attribute comments. It helps 3rd-party language generate
required string comment = 3; // doc-string.
required string comment = 3;
// If that attribute is generated, it means the Paddle third language // If that attribute is generated, it means the Paddle third language
// binding has responsibility to fill that attribute. End-User should // binding has responsibility to fill that attribute. End-User should
// not set that attribute. // not set that attribute.
optional bool generated = 4 [default=false]; optional bool generated = 4 [ default = false ];
} }
// Input or output message for 3rd-party language binding. // Input or output message for 3rd-party language binding.
// It contains parameter name and its comments. // It contains parameter name and its comments.
message VarProto { message VarProto {
// Input or output name in that op creation function. // Input or output name in that op creation function.
// e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names. // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
required string name = 1; required string name = 1;
// The comment for that input. It helps 3rd-party language generate doc-string. // The comment for that input. It helps 3rd-party language generate
required string comment = 2; // doc-string.
required string comment = 2;
// Is that input/output could be a list or not.
// If so, that Op should write a attributed named `input_format` or // Is that input/output could be a list or not.
// `output_format`. // If so, that Op should write a attributed named `input_format` or
// // `output_format`.
// e.g. //
// If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W` // e.g.
// could be multiple, so the multiple of `X` and `W` is True, and OpDesc // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W`
// will hold a attribute of them. // could be multiple, so the multiple of `X` and `W` is True, and OpDesc
// // will hold a attribute of them.
// The Op desc of same fc could be //
// { // The Op desc of same fc could be
// "type": "fc", // {
// "input": ["X1", "X2", "W1", "W2", "b"], // "type": "fc",
// "output": "fc.out", // "input": ["X1", "X2", "W1", "W2", "b"],
// "attrs" : { // "output": "fc.out",
// "input_format": [0, 2, 4, 5] // "attrs" : {
// } // "input_format": [0, 2, 4, 5]
// } // }
// // }
optional bool multiple = 3 [default=false]; //
optional bool multiple = 3 [ default = false ];
// It marks that output is a temporary output. That output is not used by
// user, but used by other op internally as input. If other op is not use // It marks that output is a temporary output. That output is not used by
// that output, it could be optimized early. // user, but used by other op internally as input. If other op is not use
// // that output, it could be optimized early.
// Attribute temporary_index will be set in OpDesc if there is some //
// outputs are temporary. // Attribute temporary_index will be set in OpDesc if there is some
// // outputs are temporary.
// output = [ "xxx.out1", "xxx.tmp", "xxx.out2"], //
// attrs = { // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"],
// "temporary_index": [1] // attrs = {
// } // "temporary_index": [1]
optional bool temporary = 4 [default=false]; // }
optional bool temporary = 4 [ default = false ];
// The gradient of operator can be ignored immediately
// e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2 // The gradient of operator can be ignored immediately
// can be ignored for the future optimized on graph. // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2
optional bool ignore_gradient = 6; // can be ignored for the future optimized on graph.
optional bool ignore_gradient = 6;
} }
// Op protocol message for 3rd-party language binding. // Op protocol message for 3rd-party language binding.
// It contains all information for generating op creation method. // It contains all information for generating op creation method.
message OpProto { message OpProto {
// The input information to generate op creation method. // The input information to generate op creation method.
repeated VarProto inputs = 1; repeated VarProto inputs = 1;
// The output information to generate op creation method. // The output information to generate op creation method.
repeated VarProto outputs = 2; repeated VarProto outputs = 2;
// The attribute information to generate op creation method. // The attribute information to generate op creation method.
repeated AttrProto attrs = 3; repeated AttrProto attrs = 3;
// The comments for that Op. It helps 3rd-party language generate // The comments for that Op. It helps 3rd-party language generate
// doc-string. The whole documentation of that Op is generated by comment, // doc-string. The whole documentation of that Op is generated by comment,
// inputs, outputs, attrs together. // inputs, outputs, attrs together.
required string comment = 4; required string comment = 4;
// The type of that Op.
required string type = 5;
// The type of that Op.
required string type = 5;
} }
...@@ -22,14 +22,14 @@ namespace framework { ...@@ -22,14 +22,14 @@ namespace framework {
template <> template <>
Eigen::DefaultDevice& ExecutionContext::GetEigenDevice< Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
platform::CPUPlace, Eigen::DefaultDevice>() const { platform::CPUPlace, Eigen::DefaultDevice>() const {
return *device_context_.get_eigen_device<Eigen::DefaultDevice>(); return *device_context_->get_eigen_device<Eigen::DefaultDevice>();
} }
#ifndef PADDLE_ONLY_CPU #ifndef PADDLE_ONLY_CPU
template <> template <>
Eigen::GpuDevice& Eigen::GpuDevice&
ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const { ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
return *device_context_.get_eigen_device<Eigen::GpuDevice>(); return *device_context_->get_eigen_device<Eigen::GpuDevice>();
} }
#endif #endif
......
此差异已折叠。
...@@ -157,22 +157,22 @@ class CPUKernalMultiInputsTest : public OpKernel { ...@@ -157,22 +157,22 @@ class CPUKernalMultiInputsTest : public OpKernel {
ASSERT_EQ(xs[2], "x2"); ASSERT_EQ(xs[2], "x2");
auto inVar0 = ctx.MultiInputVar("xs"); auto inVar0 = ctx.MultiInputVar("xs");
ASSERT_EQ(inVar0.size(), 3); ASSERT_EQ(inVar0.size(), 3U);
auto intVar1 = ctx.InputVar("k"); auto intVar1 = ctx.InputVar("k");
ASSERT_NE(intVar1, nullptr); ASSERT_NE(intVar1, nullptr);
auto outVar0 = ctx.MultiOutputVar("ys"); auto outVar0 = ctx.MultiOutputVar("ys");
ASSERT_EQ(outVar0.size(), 2); ASSERT_EQ(outVar0.size(), 2U);
auto inTensor0 = ctx.MultiInput<Tensor>("xs"); auto inTensor0 = ctx.MultiInput<Tensor>("xs");
ASSERT_EQ(inTensor0.size(), 3); ASSERT_EQ(inTensor0.size(), 3U);
auto intTensor1 = ctx.Input<Tensor>("k"); auto intTensor1 = ctx.Input<Tensor>("k");
ASSERT_NE(intTensor1, nullptr); ASSERT_NE(intTensor1, nullptr);
auto outTensor0 = ctx.MultiOutput<Tensor>("ys"); auto outTensor0 = ctx.MultiOutput<Tensor>("ys");
ASSERT_EQ(outTensor0.size(), 2); ASSERT_EQ(outTensor0.size(), 2U);
auto k = ctx.op_.Input("k"); auto k = ctx.op_.Input("k");
ASSERT_EQ(k, "k0"); ASSERT_EQ(k, "k0");
......
...@@ -32,7 +32,7 @@ TEST(BufferTest, SequenceIdArg) { ...@@ -32,7 +32,7 @@ TEST(BufferTest, SequenceIdArg) {
sizeOfValuType(VALUE_TYPE_INT32)); sizeOfValuType(VALUE_TYPE_INT32));
SequenceIdArg buffer(memory.getBuf(), shape); SequenceIdArg buffer(memory.getBuf(), shape);
EXPECT_EQ(buffer.data(), memory.getBuf()); EXPECT_EQ(buffer.data(), memory.getBuf());
EXPECT_EQ(buffer.numSeqs(), 9); EXPECT_EQ(buffer.numSeqs(), 9U);
} }
} // namespace paddle } // namespace paddle
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册