Commit d7678df5 authored by typhoonzero

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into update_docker_docs

...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
description: Format files with ClangFormat. description: Format files with ClangFormat.
entry: clang-format -i entry: clang-format -i
language: system language: system
files: \.(c|cc|cxx|cpp|h|hpp|hxx)$ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang - repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks: hooks:
......
...@@ -36,8 +36,8 @@ include(simd) ...@@ -36,8 +36,8 @@ include(simd)
################################ Configurations ####################################### ################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." OFF) option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND})
option(WITH_MKLML "Compile PaddlePaddle with mklml package." OFF) option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
......
...@@ -27,13 +27,16 @@ RUN apt-get update && \ ...@@ -27,13 +27,16 @@ RUN apt-get update && \
git python-pip python-dev openssh-server bison \ git python-pip python-dev openssh-server bison \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \ curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-numpy python-matplotlib gcc-4.8 g++-4.8 \ python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format-3.8 swig doxygen cmake \ automake locales clang-format-3.8 swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \ liblapack-dev liblapacke-dev libboost-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools && \ net-tools && \
apt-get clean -y apt-get clean -y
# paddle uses numpy.flip, which was introduced in numpy 1.12.0
RUN pip --no-cache-dir install 'numpy>=1.12.0'
# Install Go and glide # Install Go and glide
RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
tar -C /usr/local -xzf go.tgz && \ tar -C /usr/local -xzf go.tgz && \
......
...@@ -74,8 +74,6 @@ if(WITH_MKLDNN) ...@@ -74,8 +74,6 @@ if(WITH_MKLDNN)
set(OPENMP_FLAGS "-fopenmp") set(OPENMP_FLAGS "-fopenmp")
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
else() else()
......
...@@ -7,7 +7,7 @@ INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any) ...@@ -7,7 +7,7 @@ INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any)
ExternalProject_Add( ExternalProject_Add(
extern_lib_any extern_lib_any
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/thelink2012/any.git" GIT_REPOSITORY "https://github.com/PaddlePaddle/any.git"
GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020" GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020"
PREFIX ${ANY_SOURCE_DIR} PREFIX ${ANY_SOURCE_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
......
...@@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ...@@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
ExternalProject_Add( ExternalProject_Add(
extern_gflags extern_gflags
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/gflags/gflags.git" # TODO(yiwang): The annoying warnings mentioned in
# https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by
# gflags. I filed a PR https://github.com/gflags/gflags/pull/230
# to fix it. Until that PR is accepted by the gflags team, we
# temporarily use my personal fork, which contains the above fix. Let's
# change this back to the official GitHub repo once my PR is
# merged.
GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git"
PREFIX ${GFLAGS_SOURCES_DIR} PREFIX ${GFLAGS_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
...@@ -69,8 +69,13 @@ ENDIF(NOT ${CBLAS_FOUND}) ...@@ -69,8 +69,13 @@ ENDIF(NOT ${CBLAS_FOUND})
MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
ADD_LIBRARY(cblas STATIC IMPORTED) # FIXME(gangliao): generate cblas target to track all high performance
SET_PROPERTY(TARGET cblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES}) # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
ADD_LIBRARY(cblas STATIC ${dummyfile})
TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
IF(NOT ${CBLAS_FOUND}) IF(NOT ${CBLAS_FOUND})
ADD_DEPENDENCIES(cblas extern_openblas) ADD_DEPENDENCIES(cblas extern_openblas)
LIST(APPEND external_project_dependencies cblas) LIST(APPEND external_project_dependencies cblas)
......
...@@ -24,7 +24,6 @@ IF(WITH_PYTHON) ...@@ -24,7 +24,6 @@ IF(WITH_PYTHON)
ENDIF(WITH_PYTHON) ENDIF(WITH_PYTHON)
SET(py_env "") SET(py_env "")
SET(USE_VIRTUALENV_FOR_TEST 1)
IF(PYTHONINTERP_FOUND) IF(PYTHONINTERP_FOUND)
find_python_module(pip REQUIRED) find_python_module(pip REQUIRED)
find_python_module(numpy REQUIRED) find_python_module(numpy REQUIRED)
......
...@@ -115,7 +115,7 @@ set(COMMON_FLAGS ...@@ -115,7 +115,7 @@ set(COMMON_FLAGS
-Wno-error=literal-suffix -Wno-error=literal-suffix
-Wno-error=sign-compare -Wno-error=sign-compare
-Wno-error=unused-local-typedefs -Wno-error=unused-local-typedefs
-Wno-error=parentheses-equality # Warnings in Pybind11 -Wno-error=parentheses-equality # Warnings in pybind11
) )
set(GPU_COMMON_FLAGS set(GPU_COMMON_FLAGS
...@@ -195,6 +195,7 @@ endif() ...@@ -195,6 +195,7 @@ endif()
# Modern gpu architectures: Pascal # Modern gpu architectures: Pascal
if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0") if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60") list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
endif() endif()
# Custom gpu architecture # Custom gpu architecture
......
...@@ -403,3 +403,16 @@ function(py_proto_compile TARGET_NAME) ...@@ -403,3 +403,16 @@ function(py_proto_compile TARGET_NAME)
protobuf_generate_python(py_srcs ${py_proto_compile_SRCS}) protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs}) add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
endfunction() endfunction()
function(py_test TARGET_NAME)
if(WITH_TESTING)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
python2 ${py_test_SRCS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endfunction()
...@@ -149,9 +149,12 @@ endfunction() ...@@ -149,9 +149,12 @@ endfunction()
# Create a python unittest using run_python_tests.sh, # Create a python unittest using run_python_tests.sh,
# which takes care of making correct running environment # which takes care of making correct running environment
function(add_python_test TEST_NAME) function(add_python_test TEST_NAME)
add_test(NAME ${TEST_NAME} foreach(arg ${ARGN})
COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR} get_filename_component(py_fn ${arg} NAME_WE)
bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh set(TRG_NAME ${TEST_NAME}_${py_fn})
${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN} add_test(NAME ${TRG_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
python2 ${arg}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endforeach()
endfunction() endfunction()
# Intel® MKL-DNN on PaddlePaddle: Design Doc
We plan to integrate the Intel Math Kernel Library for Deep Neural Networks (**MKL-DNN**\[[1](#references)\]) into PaddlePaddle, to take full advantage of Intel platforms and effectively improve PaddlePaddle's performance on Intel architectures.

Our short-term goals are:

- Complete MKL-DNN implementations of the commonly used layers.
- Complete MKL-DNN implementations of the common deep neural networks VGG, GoogLeNet, and ResNet.
## Contents
- [Overview](#overview)
- [Actions](#actions)
- [CMake](#cmake)
- [Layers](#layers)
- [Activations](#activations)
- [Unit Tests](#unit-tests)
- [Protobuf Messages](#protobuf-messages)
- [Python API](#python-api)
- [Demos](#demos)
- [Benchmarking](#benchmarking)
- [Others](#others)
- [Design Concerns](#design-concerns)
## Overview
We will integrate MKL-DNN into PaddlePaddle as a third-party library. The overall architecture is shown below:
<div align="center">
<img src="image/overview.png" width=350><br/>
Figure 1. PaddlePaddle on IA.
</div>
## Actions
We divide the integration work into the following aspects.
### CMake
We will add a `WITH_MKLDNN` option to `CMakeLists.txt`; setting it to `ON` enables building the MKL-DNN functionality and automatically turns on OpenMP to improve MKL-DNN's performance.
We will also introduce a `WITH_MKLML` option that selects whether to use the MKLML package shipped with MKL-DNN. The package can be used independently of MKL-DNN, but for the best performance we recommend enabling MKLML whenever MKL-DNN is enabled.
Accordingly, we will create `mkldnn.cmake` and `mklml.cmake` files under the `cmake/external` directory; they download the corresponding packages while PaddlePaddle is being built and place them in PaddlePaddle's third-party directory.
**Note**: when `WITH_MKLML=ON`, this package is preferred as PaddlePaddle's CBLAS and LAPACK library, so the logic in `cmake/cblas.cmake` will be adjusted slightly.
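
A minimal sketch of how these options could be wired together, assuming the layout described above; the `include(external/...)` lines and surrounding details are illustrative assumptions, not the exact `CMakeLists.txt` contents:

```cmake
# Illustrative sketch only. WITH_MKLDNN / WITH_MKLML and the AVX_FOUND
# defaults match the diff above; everything else is an assumption.
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND})
option(WITH_MKLML  "Compile PaddlePaddle with mklml package."   ${AVX_FOUND})

if(WITH_MKLDNN)
  # MKL-DNN gets its multi-threading from OpenMP, so enable it automatically.
  set(OPENMP_FLAGS "-fopenmp")
  set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
  include(external/mkldnn)   # cmake/external/mkldnn.cmake downloads the package
endif()

if(WITH_MKLML)
  include(external/mklml)    # cmake/external/mklml.cmake; preferred CBLAS/LAPACK
endif()
```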
### Layers
All MKL-DNN-related C++ layers will be stored in `paddle/gserver/layers`, following PaddlePaddle's directory structure, and their file names will all start with *Mkldnn*.
All MKL-DNN layers will inherit from a common parent class named `MkldnnLayer`, which in turn inherits from PaddlePaddle's base class `Layer`.
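
A minimal C++ sketch of this hierarchy; the class and file names come from this design, while everything else is an illustrative assumption:

```cpp
// Illustrative sketch of the planned hierarchy; not the real PaddlePaddle API.
class Layer { /* PaddlePaddle's existing base class */ };

// Common parent of all MKL-DNN layers: shared setup such as the MKL-DNN
// engine, stream, and memory-format bookkeeping would live here.
class MkldnnLayer : public Layer {};

// A concrete layer, e.g. paddle/gserver/layers/MkldnnFcLayer.h;
// it only creates and executes its own MKL-DNN primitives.
class MkldnnFcLayer : public MkldnnLayer {};
```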
### Activations
Since activation functions in PaddlePaddle are independent of the layer concept, we will add a `MkldnnActivation.h` file under the `paddle/gserver/activations` directory to define the MKL-DNN-specific interfaces, while the implementations remain in `ActivationFunction.cpp`.
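
A hypothetical sketch of what `MkldnnActivation.h` might declare; the interface below is an assumption for illustration:

```cpp
// Illustrative MkldnnActivation.h sketch; names are assumptions.
#pragma once

namespace paddle {

// Hooks shared by all MKL-DNN activations; the implementations themselves
// stay in ActivationFunction.cpp as described above.
class MkldnnActivation {
 public:
  virtual ~MkldnnActivation() = default;
  // Re-create the MKL-DNN primitives when the input shape changes.
  virtual void reset() = 0;
};

}  // namespace paddle
```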
### Unit Tests
We will add `test_Mkldnn.cpp` and `MkldnnTester.*` under the `paddle/gserver/test` directory for MKL-DNN testing.
For activation tests, we plan to add new test types directly to PaddlePaddle's existing test files.
### Protobuf Messages
Depending on the needs of specific layers, the necessary options may be added to `proto/ModelConfig.proto`.
### Python API
For now we only consider the **v1 API**.
We plan to add a `use_mkldnn` option to `python/paddle/trainer/config_parser.py` so that users can easily choose MKL-DNN layers.
For example, a possible implementation:
```python
use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
if use_mkldnn:
    self.layer_type = "mkldnn_*"  # placeholder for the concrete mkldnn_ layer type
```
All MKL-DNN layer types will start with the *mkldnn_* prefix to distinguish them.
We may also add the necessary MKL-DNN interfaces to `activations.py` and `layers.py` under the `python/paddle/trainer_config_helper` directory.
### Demos
We will add an `mkldnn` folder under the `v1_api_demo` directory, containing demo scripts for testing MKL-DNN.
### Benchmarking
We will consider adding logic to `benchmark/paddle/image/run.sh` to benchmark the MKL-DNN path.
### Others
1. When MKL-DNN is in use, CPU buffers will be aligned to 64 bytes.
2. Dig deeper into PaddlePaddle for other optimization opportunities, for example using OpenMP to improve the performance of SGD parameter updates.
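
As a sketch of point 1, `posix_memalign` is one standard way to obtain 64-byte-aligned CPU buffers (illustrative; not necessarily the allocator PaddlePaddle will use):

```cpp
#include <cstddef>
#include <cstdlib>

// Allocate a buffer aligned to 64 bytes, the alignment MKL-DNN prefers
// for vectorized access; release the result with free().
void* allocAligned64(std::size_t bytes) {
  void* ptr = nullptr;
  if (posix_memalign(&ptr, 64, bytes) != 0) return nullptr;
  return ptr;
}
```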
## Design Concerns
To better conform to PaddlePaddle's coding style\[[2](#references)\] while sacrificing as little of MKL-DNN's performance as possible\[[3](#references)\],
we summarize the points that need particular attention:

1. Use **deviceId_**. To add as few variables and functions to the parent class Layer as possible, we decided to reuse the existing `deviceId_` variable to distinguish layer properties, defining `-2` as the device ID reserved for `MkldnnLayer`.
2. Override the parent class Layer's **init** function to set `deviceId_` to `-2`, indicating that the layer runs in the MKL-DNN environment.
3. Create `MkldnnMatrix` to manage the memory functions, interfaces, and format information that MKL-DNN uses.
4. Create `MkldnnBase` to define the classes and functions beyond layers and memory, including `MkldnnStream` and `CpuEngine` used by MKL-DNN, and possibly `FPGAEngine` in the future.
5. Add two `MkldnnMatrixPtr` members to **Argument**, named `mkldnnValue` and `mkldnnGrad`, to hold the memory buffers used by `MkldnnLayer`, and add a function cvt (to be renamed to something more suitable) that converts memory between the "CPU device" and the "MKL-DNN device".
6. Add logic to the `getOutput` function of the parent class `Layer` that checks `deviceId` and, when the device is not consistent between MKL-DNN and CPU, performs an up-front conversion, i.e. calls `Argument`'s cvt function to bring the output onto the required device; see the sketch after this list.
7. Add a `use_mkldnn` flag to the existing `FLAGS` to choose whether to enable the MKL-DNN functionality.
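
A hypothetical sketch of items 5 and 6; `MKLDNN_DEVICE`, the member layout, and the `cvt` body are assumptions for illustration:

```cpp
// Illustrative sketch; -2 is the reserved MkldnnLayer device ID from item 1.
constexpr int MKLDNN_DEVICE = -2;

struct Argument {
  // Existing CPU-side buffer (NCHW), plus the new MKL-DNN-side buffers:
  // MatrixPtr value;
  // MkldnnMatrixPtr mkldnnValue, mkldnnGrad;   // item 5

  // cvt (name to be revisited): make sure the data lives on `deviceId`,
  // reordering between CPU and MKL-DNN internal formats only when needed.
  void cvt(int deviceId) {
    if (deviceId == MKLDNN_DEVICE) {
      // reorder value -> mkldnnValue if the formats differ
    } else {
      // reorder mkldnnValue -> value
    }
  }
};
// Item 6: Layer::getOutput would call output.cvt(deviceId) up front when
// the requesting device differs from where the output currently lives.
```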
## References
1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN")
2. [The original proposal](https://github.com/PaddlePaddle/Paddle/pull/3096) would introduce **nextLayer** information. In PaddlePaddle, however, neither the pre-refactor layer nor the post-refactor op is supposed to know anything about the next layer/op.
3. MKL-DNN's high-performance formats differ from PaddlePaddle's native `NCHW` (the cuDNN parts of PaddlePaddle also use `NCHW`, so they do not have this problem), so a conversion mechanism is needed, and the format should be converted only when necessary to get the best performance out of MKL-DNN.
...@@ -21,22 +21,15 @@ ...@@ -21,22 +21,15 @@
# #
# It is the same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...} # It is the same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...}
# #
PYPATH=""
if ! python -c "import paddle" >/dev/null 2>/dev/null; then set -x
PYPATH="" while getopts "d:" opt; do
set -x
while getopts "d:" opt; do
case $opt in case $opt in
d) d)
PYPATH=$OPTARG PYPATH=$OPTARG
;; ;;
esac esac
done done
shift $(($OPTIND - 1)) shift $(($OPTIND - 1))
export PYTHONPATH=$PYPATH:$PYTHONPATH export PYTHONPATH=$PYPATH:$PYTHONPATH
$@ $@
else
echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."
echo "Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'"
exit 1
fi
add_python_test(test_swig_api py_test(testTrain SRCS testTrain.py)
testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py testTrainer.py) py_test(testMatrix SRCS testMatrix.py)
py_test(testVector SRCS testVector.py)
py_test(testTrainer SRCS testTrainer.py)
py_test(testArguments SRCS testArguments.py)
py_test(testGradientMachine SRCS testGradientMachine.py)
...@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_batch_transpose.h"
#include "hl_base.h" #include "hl_base.h"
#include "hl_batch_transpose.h"
const int TILE_DIM = 64; const int TILE_DIM = 64;
const int BLOCK_ROWS = 16; const int BLOCK_ROWS = 16;
// No bank-conflict transpose for a batch of data. // No bank-conflict transpose for a batch of data.
__global__ void batchTransposeNoBankConflicts(real* odata, __global__ void batchTransposeNoBankConflicts(
const real* idata, real* odata, const real* idata, int numSamples, int width, int height) {
int numSamples, int width,
int height) {
__shared__ float tile[TILE_DIM][TILE_DIM + 1]; __shared__ float tile[TILE_DIM][TILE_DIM + 1];
const int x = blockIdx.x * TILE_DIM + threadIdx.x; const int x = blockIdx.x * TILE_DIM + threadIdx.x;
...@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata, ...@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
newX] = tile[threadIdx.x][j]; newX] = tile[threadIdx.x][j];
} }
void batchTranspose(const real* input, real* output, int width, int height, void batchTranspose(
int batchSize) { const real* input, real* output, int width, int height, int batchSize) {
dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize); dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>> batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
(output, input, batchSize, width, height); output, input, batchSize, width, height);
CHECK_SYNC("batchTranspose failed!"); CHECK_SYNC("batchTranspose failed!");
} }
...@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_aggregate.h"
#include "hl_base.h" #include "hl_base.h"
#include "hl_cuda.h" #include "hl_cuda.h"
#include "hl_cuda.ph" #include "hl_cuda.ph"
#include "hl_aggregate.h"
#include "hl_thread.ph"
#include "hl_matrix_base.cuh" #include "hl_matrix_base.cuh"
#include "hl_thread.ph"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
/** /**
* @brief matrix row operator. * @brief matrix row operator.
*/ */
template<class Agg, int blockSize> template <class Agg, int blockSize>
__global__ void KeMatrixRowOp(Agg agg, __global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
real *E,
real *Sum,
int dimN) {
__shared__ real sum_s[blockSize]; __shared__ real sum_s[blockSize];
int cnt = (dimN + blockSize -1) / blockSize; int cnt = (dimN + blockSize - 1) / blockSize;
int rowId = blockIdx.x + blockIdx.y*gridDim.x; int rowId = blockIdx.x + blockIdx.y * gridDim.x;
int index = rowId*dimN; int index = rowId * dimN;
int tid = threadIdx.x; int tid = threadIdx.x;
int lmt = tid; int lmt = tid;
...@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg, ...@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg,
sum_s[tid] = tmp; sum_s[tid] = tmp;
__syncthreads(); __syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) { for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) { if (tid < stride) {
sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]); sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
} }
...@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg, ...@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg,
} }
template <class Agg> template <class Agg>
void hl_matrix_row_op(Agg agg, void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
real *A_d,
real *C_d,
int dimM,
int dimN) {
int blocksX = dimM; int blocksX = dimM;
int blocksY = 1; int blocksY = 1;
dim3 threads(128, 1); dim3 threads(128, 1);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixRowOp<Agg, 128><<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
(agg, A_d, C_d, dimN); agg, A_d, C_d, dimN);
} }
void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) { void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::sum(), hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_row_sum failed"); CHECK_SYNC("hl_matrix_row_sum failed");
} }
...@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::max(), hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_row_max failed"); CHECK_SYNC("hl_matrix_row_max failed");
} }
...@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::min(), hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_row_min failed"); CHECK_SYNC("hl_matrix_row_min failed");
} }
/** /**
* @brief matrix column operator. * @brief matrix column operator.
*/ */
template<class Agg> template <class Agg>
__global__ void KeMatrixColumnOp(Agg agg, __global__ void KeMatrixColumnOp(
real *E, Agg agg, real *E, real *Sum, int dimM, int dimN) {
real *Sum,
int dimM,
int dimN) {
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
real tmp = agg.init(); real tmp = agg.init();
if (rowIdx < dimN) { if (rowIdx < dimN) {
...@@ -127,13 +104,10 @@ __global__ void KeMatrixColumnOp(Agg agg, ...@@ -127,13 +104,10 @@ __global__ void KeMatrixColumnOp(Agg agg,
} }
} }
template<class Agg, int blockDimX, int blockDimY> template <class Agg, int blockDimX, int blockDimY>
__global__ void KeMatrixColumnOp_S(Agg agg, __global__ void KeMatrixColumnOp_S(
real *E, Agg agg, real *E, real *Sum, int dimM, int dimN) {
real *Sum, __shared__ real _sum[blockDimX * blockDimY];
int dimM,
int dimN) {
__shared__ real _sum[blockDimX*blockDimY];
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int index = threadIdx.y; int index = threadIdx.y;
...@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg, ...@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
index += blockDimY; index += blockDimY;
} }
} }
_sum[threadIdx.x + threadIdx.y*blockDimX] = tmp; _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
__syncthreads(); __syncthreads();
if (rowIdx < dimN) { if (rowIdx < dimN) {
if (threadIdx.y ==0) { if (threadIdx.y == 0) {
real tmp = agg.init(); real tmp = agg.init();
for (int i=0; i < blockDimY; i++) { for (int i = 0; i < blockDimY; i++) {
tmp = agg(tmp, _sum[threadIdx.x + i*blockDimX]); tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
} }
Sum[rowIdx] = tmp; Sum[rowIdx] = tmp;
} }
...@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg, ...@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
} }
template <class Agg> template <class Agg>
void hl_matrix_column_op(Agg agg, void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
real *A_d,
real *C_d,
int dimM,
int dimN) {
if (dimN >= 8192) { if (dimN >= 8192) {
int blocksX = (dimN + 128 -1) / 128; int blocksX = (dimN + 128 - 1) / 128;
int blocksY = 1; int blocksY = 1;
dim3 threads(128, 1); dim3 threads(128, 1);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixColumnOp<Agg><<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
(agg, A_d, C_d, dimM, dimN); agg, A_d, C_d, dimM, dimN);
} else { } else {
int blocksX = (dimN + 32 -1) / 32; int blocksX = (dimN + 32 - 1) / 32;
int blocksY = 1; int blocksY = 1;
dim3 threads(32, 32); dim3 threads(32, 32);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixColumnOp_S<Agg, 32, 32><<< grid, threads, 0, STREAM_DEFAULT>>> KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
(agg, A_d, C_d, dimM, dimN); agg, A_d, C_d, dimM, dimN);
} }
return; return;
...@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::sum(), hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_column_sum failed"); CHECK_SYNC("hl_matrix_column_sum failed");
} }
...@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::max(), hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_column_max failed"); CHECK_SYNC("hl_matrix_column_max failed");
} }
...@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::min(), hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_column_min failed"); CHECK_SYNC("hl_matrix_column_min failed");
} }
...@@ -226,16 +184,16 @@ template <int blockSize> ...@@ -226,16 +184,16 @@ template <int blockSize>
__global__ void KeVectorSum(real *E, real *Sum, int dimM) { __global__ void KeVectorSum(real *E, real *Sum, int dimM) {
__shared__ double sum_s[blockSize]; __shared__ double sum_s[blockSize];
int tid = threadIdx.x; int tid = threadIdx.x;
int index = blockIdx.y*blockDim.x+threadIdx.x; int index = blockIdx.y * blockDim.x + threadIdx.x;
sum_s[tid] = 0.0f; sum_s[tid] = 0.0f;
while (index < dimM) { while (index < dimM) {
sum_s[tid] += E[index]; sum_s[tid] += E[index];
index += blockDim.x*gridDim.y; index += blockDim.x * gridDim.y;
} }
__syncthreads(); __syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) { for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) { if (tid < stride) {
sum_s[tid] += sum_s[tid + stride]; sum_s[tid] += sum_s[tid + stride];
} }
...@@ -261,36 +219,37 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) { ...@@ -261,36 +219,37 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
hl_event_t hl_event = &hl_event_st; hl_event_t hl_event = &hl_event_st;
while (!hl_cuda_event_is_ready(hl_event)) {} while (!hl_cuda_event_is_ready(hl_event)) {
}
KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
(A_d, t_resource.gpu_mem, dimM); A_d, t_resource.gpu_mem, dimM);
KeVectorSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
(t_resource.gpu_mem, t_resource.cpu_mem, 128); t_resource.gpu_mem, t_resource.cpu_mem, 128);
hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
hl_stream_synchronize(HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT);
cudaError_t err = (cudaError_t)hl_get_device_last_error(); cudaError_t err = (cudaError_t)hl_get_device_last_error();
CHECK_EQ(cudaSuccess, err) CHECK_EQ(cudaSuccess, err) << "CUDA error: "
<< "CUDA error: " << hl_get_device_error_string((size_t)err); << hl_get_device_error_string((size_t)err);
} }
template <int blockSize> template <int blockSize>
__global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) { __global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
__shared__ double sum_s[blockSize]; __shared__ double sum_s[blockSize];
int tid = threadIdx.x; int tid = threadIdx.x;
int index = blockIdx.y*blockDim.x+threadIdx.x; int index = blockIdx.y * blockDim.x + threadIdx.x;
sum_s[tid] = 0.0f; sum_s[tid] = 0.0f;
while (index < dimM) { while (index < dimM) {
sum_s[tid] += abs(E[index]); sum_s[tid] += abs(E[index]);
index += blockDim.x*gridDim.y; index += blockDim.x * gridDim.y;
} }
__syncthreads(); __syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) { for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) { if (tid < stride) {
sum_s[tid] += sum_s[tid + stride]; sum_s[tid] += sum_s[tid + stride];
} }
...@@ -316,18 +275,19 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) { ...@@ -316,18 +275,19 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
hl_event_t hl_event = &hl_event_st; hl_event_t hl_event = &hl_event_st;
while (!hl_cuda_event_is_ready(hl_event)) {} while (!hl_cuda_event_is_ready(hl_event)) {
}
KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
(A_d, t_resource.gpu_mem, dimM); A_d, t_resource.gpu_mem, dimM);
KeVectorAbsSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
(t_resource.gpu_mem, t_resource.cpu_mem, 128); t_resource.gpu_mem, t_resource.cpu_mem, 128);
hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
hl_stream_synchronize(HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT);
cudaError_t err = (cudaError_t)hl_get_device_last_error(); cudaError_t err = (cudaError_t)hl_get_device_last_error();
CHECK_EQ(cudaSuccess, err) CHECK_EQ(cudaSuccess, err) << "CUDA error: "
<< "CUDA error: " << hl_get_device_error_string((size_t)err); << hl_get_device_error_string((size_t)err);
} }
(Two file diffs are collapsed and not shown.)
...@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_base.h" #include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
#include "hl_matrix.h" #include "hl_matrix.h"
#include "hl_matrix_ops.cuh"
#include "hl_matrix_apply.cuh" #include "hl_matrix_apply.cuh"
#include "hl_matrix_ops.cuh"
#include "hl_sequence.h" #include "hl_sequence.h"
#include "hl_sparse.ph" #include "hl_sparse.ph"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
DEFINE_MATRIX_UNARY_OP(Zero, a = 0); DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b); DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);
void hl_matrix_add(real *A_d, void hl_matrix_add(real* A_d,
real *B_d, real* B_d,
real *C_d, real* C_d,
int dimM, int dimM,
int dimN, int dimN,
real alpha, real alpha,
...@@ -36,8 +35,8 @@ void hl_matrix_add(real *A_d, ...@@ -36,8 +35,8 @@ void hl_matrix_add(real *A_d,
CHECK_NOTNULL(B_d); CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_gpu_apply_ternary_op hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
<real, ternary::_add<real>, 0, 0>(ternary::_add<real>(alpha, beta), ternary::_add<real>(alpha, beta),
A_d, A_d,
B_d, B_d,
C_d, C_d,
...@@ -50,12 +49,11 @@ void hl_matrix_add(real *A_d, ...@@ -50,12 +49,11 @@ void hl_matrix_add(real *A_d,
} }
#ifdef PADDLE_TYPE_DOUBLE #ifdef PADDLE_TYPE_DOUBLE
#define THRESHOLD 128 #define THRESHOLD 128
#else #else
#define THRESHOLD 64 #define THRESHOLD 64
#endif #endif
__device__ __forceinline__ __device__ __forceinline__ void findMax(real* I,
void findMax(real* I,
real* dfMax_s, real* dfMax_s,
int blockSize, int blockSize,
int base, int base,
...@@ -89,8 +87,7 @@ void findMax(real* I, ...@@ -89,8 +87,7 @@ void findMax(real* I,
__syncthreads(); __syncthreads();
} }
__device__ __forceinline__ __device__ __forceinline__ void subMaxAndExp(real* I,
void subMaxAndExp(real* I,
real* O, real* O,
int curIdx, int curIdx,
int nextIdx, int nextIdx,
...@@ -115,8 +112,7 @@ void subMaxAndExp(real* I, ...@@ -115,8 +112,7 @@ void subMaxAndExp(real* I,
__syncthreads(); __syncthreads();
} }
__device__ __forceinline__ __device__ __forceinline__ void valueSum(real* O,
void valueSum(real* O,
real* dfMax_s, real* dfMax_s,
int blockSize, int blockSize,
int base, int base,
...@@ -141,13 +137,8 @@ void valueSum(real* O, ...@@ -141,13 +137,8 @@ void valueSum(real* O,
__syncthreads(); __syncthreads();
} }
__device__ __forceinline__ __device__ __forceinline__ void divSum(
void divSum(real* O, real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
real sum,
int curIdx,
int nextIdx,
int blockSize,
int dimN) {
while (curIdx < dimN) { while (curIdx < dimN) {
O[nextIdx] /= sum; O[nextIdx] /= sum;
nextIdx += blockSize; nextIdx += blockSize;
...@@ -155,8 +146,7 @@ void divSum(real* O, ...@@ -155,8 +146,7 @@ void divSum(real* O,
} }
} }
__device__ __forceinline__ __device__ __forceinline__ void softmax(real* I,
void softmax(real* I,
real* O, real* O,
real* dfMax_s, real* dfMax_s,
int blockSize, int blockSize,
...@@ -167,8 +157,7 @@ void softmax(real* I, ...@@ -167,8 +157,7 @@ void softmax(real* I,
__shared__ real max; __shared__ real max;
// find the max number // find the max number
findMax(I, dfMax_s, blockSize, base, curIdx, findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);
nextIdx, dimN, &max);
// sub max Value and do Exp operation // sub max Value and do Exp operation
subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max); subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
...@@ -181,8 +170,8 @@ void softmax(real* I, ...@@ -181,8 +170,8 @@ void softmax(real* I,
divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN); divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
} }
template<int blockSize> template <int blockSize>
__global__ void KeMatrixSoftMax(real *O, real *I, int dimN) { __global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
int base = threadIdx.x; int base = threadIdx.x;
__shared__ real dfMax_s[blockSize]; __shared__ real dfMax_s[blockSize];
int nextIdx = blockIdx.x * dimN + base; int nextIdx = blockIdx.x * dimN + base;
...@@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) { ...@@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
} }
void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) { void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
dim3 block(512, 1); dim3 block(512, 1);
dim3 grid(dimM, 1); dim3 grid(dimM, 1);
KeMatrixSoftMax<512> KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
<<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
CHECK_SYNC("hl_matrix_softmax failed"); CHECK_SYNC("hl_matrix_softmax failed");
} }
template<int blockSize> template <int blockSize>
__global__ void KeSequenceSoftMax(real *O, real *I, const int* index) { __global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
int base = threadIdx.x; int base = threadIdx.x;
int bid = blockIdx.x; int bid = blockIdx.x;
__shared__ real dfMax_s[blockSize]; __shared__ real dfMax_s[blockSize];
...@@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) { ...@@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
} }
void hl_sequence_softmax_forward(real *A_d, void hl_sequence_softmax_forward(real* A_d,
real *C_d, real* C_d,
const int* index, const int* index,
int numSequence) { int numSequence) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
...@@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d, ...@@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d,
dim3 block(512, 1); dim3 block(512, 1);
dim3 grid(numSequence, 1); dim3 grid(numSequence, 1);
KeSequenceSoftMax<512> KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
<<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
CHECK_SYNC("hl_sequence_softmax_forward failed"); CHECK_SYNC("hl_sequence_softmax_forward failed");
} }
__global__ void KeMatrixDerivative(real *grad_d, __global__ void KeMatrixDerivative(
real *output_d, real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
real *sftmaxSum_d, int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int dimM, int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
int dimN) {
int rowIdx = blockIdx.x*blockDim.x + threadIdx.x;
int colIdx = blockIdx.y*blockDim.y + threadIdx.y;
int index; int index;
if (rowIdx < dimM && colIdx < dimN) { if (rowIdx < dimM && colIdx < dimN) {
index = rowIdx*dimN + colIdx; index = rowIdx * dimN + colIdx;
grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]); grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
} }
} }
void hl_matrix_softmax_derivative(real *grad_d, void hl_matrix_softmax_derivative(
real *output_d, real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
real *sftmaxSum_d,
int dimM,
int dimN) {
CHECK_NOTNULL(grad_d); CHECK_NOTNULL(grad_d);
CHECK_NOTNULL(output_d); CHECK_NOTNULL(output_d);
CHECK_NOTNULL(sftmaxSum_d); CHECK_NOTNULL(sftmaxSum_d);
int blocksX = (dimM + 0) / 1; int blocksX = (dimM + 0) / 1;
int blocksY = (dimN + 1024 -1) / 1024; int blocksY = (dimN + 1024 - 1) / 1024;
dim3 threads(1, 1024); dim3 threads(1, 1024);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixDerivative<<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
(grad_d, output_d, sftmaxSum_d, dimM, dimN); grad_d, output_d, sftmaxSum_d, dimM, dimN);
CHECK_SYNC("hl_matrix_softmax_derivative failed"); CHECK_SYNC("hl_matrix_softmax_derivative failed");
} }
__global__ void KeMatrixMultiBinaryCrossEntropy(real* output, __global__ void KeMatrixMultiBinaryCrossEntropy(
real* entropy, real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
int* row,
int* col,
int dimM,
int dimN) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < dimM) { if (index < dimM) {
for (int i = 0; i < dimN; i ++) { for (int i = 0; i < dimN; i++) {
entropy[index] -= log(1 - output[index * dimN + i]); entropy[index] -= log(1 - output[index * dimN + i]);
} }
int *row_col = col + row[index]; int* row_col = col + row[index];
int col_num = row[index + 1] - row[index]; int col_num = row[index + 1] - row[index];
for (int i = 0; i < col_num; i ++) { for (int i = 0; i < col_num; i++) {
real o = output[index * dimN + row_col[i]]; real o = output[index * dimN + row_col[i]];
entropy[index] -= log(o / (1 - o)); entropy[index] -= log(o / (1 - o));
} }
...@@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output, ...@@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output,
dim3 threads(n_threads); dim3 threads(n_threads);
dim3 grid(blocks); dim3 grid(blocks);
hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
KeMatrixMultiBinaryCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
(output, entropy, mat->csr_row, mat->csr_col, dimM, dimN); output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed"); CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
} }
__global__ void KeMatrixMultiBinaryCrossEntropyBp(real* output, __global__ void KeMatrixMultiBinaryCrossEntropyBp(
real* grad, real* output, real* grad, int* row, int* col, int dimM, int dimN) {
int* row,
int* col,
int dimM,
int dimN) {
int row_idx = blockIdx.x * blockDim.x + threadIdx.x; int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (row_idx < dimM) { if (row_idx < dimM) {
for (int i = 0; i < dimN; i ++) { for (int i = 0; i < dimN; i++) {
int index = row_idx * dimN + i; int index = row_idx * dimN + i;
grad[index] += 1.0 / (1 - output[index]); grad[index] += 1.0 / (1 - output[index]);
} }
int col_num = row[row_idx + 1] - row[row_idx]; int col_num = row[row_idx + 1] - row[row_idx];
int *row_col = col + row[row_idx]; int* row_col = col + row[row_idx];
for (int i = 0; i < col_num; i ++) { for (int i = 0; i < col_num; i++) {
int index = row_idx * dimN + row_col[i]; int index = row_idx * dimN + row_col[i];
grad[index] -= 1.0 / (output[index] * (1 - output[index])); grad[index] -= 1.0 / (output[index] * (1 - output[index]));
} }
} }
} }
void hl_matrix_multi_binary_cross_entropy_bp(real* output, void hl_matrix_multi_binary_cross_entropy_bp(
real* grad, real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
hl_sparse_matrix_s csr_mat,
int dimM,
int dimN) {
CHECK_NOTNULL(output); CHECK_NOTNULL(output);
CHECK_NOTNULL(grad); CHECK_NOTNULL(grad);
CHECK_NOTNULL(csr_mat); CHECK_NOTNULL(csr_mat);
...@@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output, ...@@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output,
dim3 threads(n_threads); dim3 threads(n_threads);
dim3 grid(blocks); dim3 grid(blocks);
hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
KeMatrixMultiBinaryCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
(output, grad, mat->csr_row, mat->csr_col, dimM, dimN); output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed"); CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
} }
__global__ void KeMatrixCrossEntropy(real* O, __global__ void KeMatrixCrossEntropy(
real* E, real* O, real* E, int* label, int dimM, int dimN) {
int* label,
int dimM,
int dimN) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
int newBase; int newBase;
if (index < dimM) { if (index < dimM) {
...@@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O, ...@@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O,
} }
} }
void hl_matrix_cross_entropy(real* A_d, void hl_matrix_cross_entropy(
real* C_d, real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
int* label_d,
int dimM,
int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
int blocks = (dimM + 1024 - 1) / 1024; int blocks = (dimM + 1024 - 1) / 1024;
dim3 threads(1024, 1); dim3 threads(1024, 1);
dim3 grid(blocks, 1); dim3 grid(blocks, 1);
KeMatrixCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
(A_d, C_d, label_d, dimM, dimN); A_d, C_d, label_d, dimM, dimN);
CHECK_SYNC("hl_matrix_cross_entropy failed"); CHECK_SYNC("hl_matrix_cross_entropy failed");
} }
__global__ void KeMatrixCrossEntropyBp(real* grad_d, __global__ void KeMatrixCrossEntropyBp(
real* output_d, real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
int* label_d, int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int dimM, int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
int dimN) {
int rowIdx = blockIdx.x*blockDim.x + threadIdx.x;
int colIdx = blockIdx.y*blockDim.y + threadIdx.y;
int index; int index;
if (rowIdx < dimM && colIdx < dimN) { if (rowIdx < dimM && colIdx < dimN) {
index = rowIdx*dimN + colIdx; index = rowIdx * dimN + colIdx;
if (label_d[rowIdx] == colIdx) { if (label_d[rowIdx] == colIdx) {
grad_d[index] -= 1.0f / output_d[index]; grad_d[index] -= 1.0f / output_d[index];
} }
} }
} }
void hl_matrix_cross_entropy_bp(real* grad_d, void hl_matrix_cross_entropy_bp(
real* output_d, real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
int* label_d,
int dimM,
int dimN) {
CHECK_NOTNULL(grad_d); CHECK_NOTNULL(grad_d);
CHECK_NOTNULL(output_d); CHECK_NOTNULL(output_d);
CHECK_NOTNULL(label_d); CHECK_NOTNULL(label_d);
int blocksX = (dimM + 0)/1; int blocksX = (dimM + 0) / 1;
int blocksY = (dimN + 1024 -1) / 1024; int blocksY = (dimN + 1024 - 1) / 1024;
dim3 threads(1, 1024); dim3 threads(1, 1024);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
(grad_d, output_d, label_d, dimM, dimN); grad_d, output_d, label_d, dimM, dimN);
CHECK_SYNC("hl_matrix_cross_entropy_bp failed"); CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
} }
void hl_matrix_zero_mem(real* data, int num) { void hl_matrix_zero_mem(real* data, int num) {
hl_gpu_apply_unary_op( hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
unary::Zero<real>(), data, 1, num, num);
} }
__global__ void KeParamReluForward(real* output, __global__ void KeParamReluForward(real* output,
...@@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output, ...@@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output,
int ty = blockIdx.y * blockDim.y + threadIdx.y; int ty = blockIdx.y * blockDim.y + threadIdx.y;
if (tx < width && ty < height) { if (tx < width && ty < height) {
int index = ty * width + tx; int index = ty * width + tx;
output[index] = input[index] > 0 ? input[index] : output[index] =
input[index] * w[tx / partial_sum]; input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
} }
} }
...@@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output, ...@@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output,
CHECK_NOTNULL(w); CHECK_NOTNULL(w);
dim3 threads(16, 16); dim3 threads(16, 16);
int blockX = (width + 16 - 1) / 16; int blockX = (width + 16 - 1) / 16;
int blockY = (height + 16 -1) / 16; int blockY = (height + 16 - 1) / 16;
dim3 grid(blockX, blockY); dim3 grid(blockX, blockY);
KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>> KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
(output, input, w, width, height, partial_sum); output, input, w, width, height, partial_sum);
CHECK_SYNC("hl_param_relu_forward failed"); CHECK_SYNC("hl_param_relu_forward failed");
} }
template<int blockSize> template <int blockSize>
__global__ void KeParamReluBackWardW(real* grad_w, __global__ void KeParamReluBackWardW(real* grad_w,
real* grad_o, real* grad_o,
real* input, real* input,
...@@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w, ...@@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w,
int grid_num = width / partial_sum; int grid_num = width / partial_sum;
dim3 threads(blockSize, 1); dim3 threads(blockSize, 1);
dim3 grid(grid_num, 1); dim3 grid(grid_num, 1);
KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>> KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
(grad_w, grad_o, input, width, height, partial_sum); grad_w, grad_o, input, width, height, partial_sum);
CHECK_SYNC("hl_param_relu_backward_w failed"); CHECK_SYNC("hl_param_relu_backward_w failed");
} }
...@@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o, ...@@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o,
CHECK_NOTNULL(diff); CHECK_NOTNULL(diff);
dim3 threads(16, 16); dim3 threads(16, 16);
int blockX = (width + 16 - 1) / 16; int blockX = (width + 16 - 1) / 16;
int blockY = (height + 16 -1) / 16; int blockY = (height + 16 - 1) / 16;
dim3 grid(blockX, blockY); dim3 grid(blockX, blockY);
KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>> KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
(grad_o, data, w, diff, width, height, partial_sum); grad_o, data, w, diff, width, height, partial_sum);
CHECK_SYNC("hl_param_relu_backward_diff failed"); CHECK_SYNC("hl_param_relu_backward_diff failed");
} }
__global__ void KeMatrixAddSharedBias(real* A, __global__ void KeMatrixAddSharedBias(
real* B, real* A, real* B, const int channel, const int M, const int N, real scale) {
const int channel,
const int M,
const int N,
real scale) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
int dim = N / channel; int dim = N / channel;
if (index < M * N) { if (index < M * N) {
...@@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d, ...@@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d,
real scale) { real scale) {
const int blocks = 512; const int blocks = 512;
const int grids = DIVUP(dimM * dimN, blocks); const int grids = DIVUP(dimM * dimN, blocks);
KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>> KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
(A_d, B_d, channel, dimM, dimN, scale); A_d, B_d, channel, dimM, dimN, scale);
CHECK_SYNC("hl_matrix_add_shared_bias failed"); CHECK_SYNC("hl_matrix_add_shared_bias failed");
} }
template <int blockSize> template <int blockSize>
__global__ void KeMatrixCollectSharedBias(real *B, __global__ void KeMatrixCollectSharedBias(real* B,
real *A, real* A,
const int channel, const int channel,
const int M, const int M,
const int N, const int N,
...@@ -611,14 +563,13 @@ void hl_matrix_collect_shared_bias(real* B_d, ...@@ -611,14 +563,13 @@ void hl_matrix_collect_shared_bias(real* B_d,
const int limit = 64; const int limit = 64;
int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel; int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;
KeMatrixCollectSharedBias<blocks> KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
<<< grids, blocks, 0, STREAM_DEFAULT>>> B_d, A_d, channel, dimM, dimN, dim, limit, scale);
(B_d, A_d, channel, dimM, dimN, dim, limit, scale);
CHECK_SYNC("hl_matrix_collect_shared_bias failed"); CHECK_SYNC("hl_matrix_collect_shared_bias failed");
} }
__global__ void keMatrixRotate(real* mat, real* matRot, __global__ void keMatrixRotate(
int dimM, int dimN, bool clockWise) { real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
int idx = blockIdx.x * blockDim.x + threadIdx.x; int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < dimM * dimN) { if (idx < dimM * dimN) {
int i = idx / dimN; int i = idx / dimN;
...@@ -631,13 +582,13 @@ __global__ void keMatrixRotate(real* mat, real* matRot, ...@@ -631,13 +582,13 @@ __global__ void keMatrixRotate(real* mat, real* matRot,
} }
} }
void hl_matrix_rotate(real *mat, real* matRot, void hl_matrix_rotate(
int dimM, int dimN, bool clockWise) { real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
CHECK_NOTNULL(mat); CHECK_NOTNULL(mat);
CHECK_NOTNULL(matRot); CHECK_NOTNULL(matRot);
const int threads = 512; const int threads = 512;
const int blocks = DIVUP(dimM * dimN, threads); const int blocks = DIVUP(dimM * dimN, threads);
keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>> keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
(mat, matRot, dimM, dimN, clockWise); mat, matRot, dimM, dimN, clockWise);
CHECK_SYNC("hl_matrix_rotate failed"); CHECK_SYNC("hl_matrix_rotate failed");
} }
...@@ -16,36 +16,36 @@ limitations under the License. */ ...@@ -16,36 +16,36 @@ limitations under the License. */
#include "hl_device_functions.cuh" #include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
__global__ void KeMaxSequenceForward(real *input, __global__ void KeMaxSequenceForward(real* input,
const int *sequence, const int* sequence,
real* output, real* output,
int *index, int* index,
int numSequences, int numSequences,
int dim) { int dim) {
int dimIdx = threadIdx.x; int dimIdx = threadIdx.x;
int sequenceId = blockIdx.x; int sequenceId = blockIdx.x;
if (sequenceId >= numSequences) return; if (sequenceId >= numSequences) return;
int start = sequence[sequenceId]; int start = sequence[sequenceId];
int end = sequence[sequenceId+1]; int end = sequence[sequenceId + 1];
for (int i = dimIdx; i < dim; i += blockDim.x) { for (int i = dimIdx; i < dim; i += blockDim.x) {
real tmp = -HL_FLOAT_MAX; real tmp = -HL_FLOAT_MAX;
int tmpId = -1; int tmpId = -1;
for (int insId = start; insId < end; insId++) { for (int insId = start; insId < end; insId++) {
if (tmp < input[insId*dim + i]) { if (tmp < input[insId * dim + i]) {
tmp = input[insId*dim + i]; tmp = input[insId * dim + i];
tmpId = insId; tmpId = insId;
} }
} }
output[sequenceId*dim + i] = tmp; output[sequenceId * dim + i] = tmp;
index[sequenceId*dim + i] = tmpId; index[sequenceId * dim + i] = tmpId;
} }
} }
void hl_max_sequence_forward(real* input, void hl_max_sequence_forward(real* input,
const int* sequence, const int* sequence,
real* output, real* output,
int *index, int* index,
int numSequences, int numSequences,
int dim) { int dim) {
CHECK_NOTNULL(input); CHECK_NOTNULL(input);
...@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input, ...@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input,
dim3 threads(256, 1); dim3 threads(256, 1);
dim3 grid(numSequences, 1); dim3 grid(numSequences, 1);
KeMaxSequenceForward<<< grid, threads, 0, STREAM_DEFAULT >>> KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
(input, sequence, output, index, numSequences, dim); input, sequence, output, index, numSequences, dim);
CHECK_SYNC("hl_max_sequence_forward failed"); CHECK_SYNC("hl_max_sequence_forward failed");
} }
__global__ void KeMaxSequenceBackward(real *outputGrad, __global__ void KeMaxSequenceBackward(
int *index, real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
real* inputGrad,
int numSequences,
int dim) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
int colIdx = idx % dim; int colIdx = idx % dim;
if (idx < numSequences*dim) { if (idx < numSequences * dim) {
int insId = index[idx]; int insId = index[idx];
inputGrad[insId * dim + colIdx] += outputGrad[idx]; inputGrad[insId * dim + colIdx] += outputGrad[idx];
} }
} }
void hl_max_sequence_backward(real* outputGrad, void hl_max_sequence_backward(
int *index, real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
real* inputGrad,
int numSequences,
int dim) {
CHECK_NOTNULL(outputGrad); CHECK_NOTNULL(outputGrad);
CHECK_NOTNULL(index); CHECK_NOTNULL(index);
CHECK_NOTNULL(inputGrad); CHECK_NOTNULL(inputGrad);
...@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad, ...@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad,
unsigned int blocks = (numSequences * dim + 128 - 1) / 128; unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
dim3 threads(128, 1); dim3 threads(128, 1);
dim3 grid(blocks, 1); dim3 grid(blocks, 1);
KeMaxSequenceBackward<<< grid, threads, 0, STREAM_DEFAULT >>> KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
(outputGrad, index, inputGrad, numSequences, dim); outputGrad, index, inputGrad, numSequences, dim);
CHECK_SYNC("hl_max_sequence_backward failed"); CHECK_SYNC("hl_max_sequence_backward failed");
} }
template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
                                real* table,
                                int* ids,
@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output,
  while (sampleId < numSamples) {
    int tableId = ids[sampleId];
    if ((0 <= tableId) && (tableId < tableSize)) {
      real* outputData = output + sampleId * dim;
      real* tableData = table + tableId * dim;
      for (int i = idx; i < dim; i += blockDimX) {
        if (AddRow == 0) {
          outputData[i] += tableData[i];
@@ -114,15 +108,18 @@ __global__ void KeMatrixAddRows(real* output,
        }
      }
    }
    sampleId += blockDimY * gridDimX;
  }
}

template <int blockDimX,
          int blockDimY,
          int gridDimX,
          bool seq2batch,
          bool isAdd>
__global__ void KeSequence2Batch(real* batch,
                                 real* sequence,
                                 const int* batchIndex,
                                 int seqWidth,
                                 int batchCount) {
  int idx = threadIdx.x;
@@ -130,8 +127,8 @@ void KeSequence2Batch(real *batch,
  int id = blockIdx.x + idy * gridDimX;
  while (id < batchCount) {
    int seqId = batchIndex[id];
    real* batchData = batch + id * seqWidth;
    real* seqData = sequence + seqId * seqWidth;
    for (int i = idx; i < seqWidth; i += blockDimX) {
      if (seq2batch) {
        if (isAdd) {
@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch,
        }
      }
    }
    id += blockDimY * gridDimX;
  }
}

void hl_sequence2batch_copy(real* batch,
                            real* sequence,
                            const int* batchIndex,
                            int seqWidth,
                            int batchCount,
                            bool seq2batch) {
@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  if (seq2batch) {
    KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  } else {
    KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  }
  CHECK_SYNC("hl_sequence2batch_copy failed");
}

void hl_sequence2batch_add(real* batch,
                           real* sequence,
                           int* batchIndex,
                           int seqWidth,
                           int batchCount,
                           bool seq2batch) {
@@ -186,18 +183,17 @@ void hl_sequence2batch_add(real *batch,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  if (seq2batch) {
    KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  } else {
    KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  }
  CHECK_SYNC("hl_sequence2batch_add failed");
}
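
// Illustrative sketch, not part of this commit: the row permutation that
// KeSequence2Batch parallelizes, written as plain host code. batchIndex[b]
// names the sequence row backing batch row b; seq2batch selects the copy
// direction and isAdd accumulates instead of overwriting. Name and placement
// are hypothetical.
void sequence2batch_cpu(real* batch,
                        real* sequence,
                        const int* batchIndex,
                        int seqWidth,
                        int batchCount,
                        bool seq2batch,
                        bool isAdd) {
  for (int b = 0; b < batchCount; ++b) {
    real* batchRow = batch + b * seqWidth;
    real* seqRow = sequence + batchIndex[b] * seqWidth;
    for (int i = 0; i < seqWidth; ++i) {
      real* dst = seq2batch ? batchRow : seqRow;
      real* src = seq2batch ? seqRow : batchRow;
      if (isAdd) {
        dst[i] += src[i];
      } else {
        dst[i] = src[i];
      }
    }
  }
}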
template <bool normByTimes, bool seq2batch>
__global__ void KeSequence2BatchPadding(real* batch,
                                        real* sequence,
                                        const int* sequenceStartPositions,
                                        const size_t sequenceWidth,
@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch,
  if (seq2batch) {
    /* sequence -> batch */
    if (normByTimes) {
      KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch,
          sequence,
          sequenceStartPositions,
          sequenceWidth,
          maxSequenceLength,
          numSequences);
    } else {
      KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch,
          sequence,
          sequenceStartPositions,
          sequenceWidth,
          maxSequenceLength,
          numSequences);
    }
  } else {
    /* batch -> sequence */
    if (normByTimes) {
      KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch,
          sequence,
          sequenceStartPositions,
          sequenceWidth,
          maxSequenceLength,
          numSequences);
    } else {
      KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch,
          sequence,
          sequenceStartPositions,
          sequenceWidth,
          maxSequenceLength,
          numSequences);
    }
  }
  CHECK_SYNC("hl_sequence2batch_copy_padding failed");
}

__device__ inline float my_rsqrt(float x) { return rsqrtf(x); }

__device__ inline double my_rsqrt(double x) { return rsqrt(x); }

__global__ void KeSequenceAvgForward(real* dst,
                                     real* src,
@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst,
    for (int i = start; i < end; i++) {
      sum += src[i * width + col];
    }
    sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
                                       : sum * my_rsqrt((real)seqLength));
    dst[gid] += sum;
  }
}
@@ -349,8 +357,8 @@ void hl_sequence_avg_forward(real* dst,
  CHECK(mode == 0 || mode == 1 || mode == 2)
      << "mode error in hl_sequence_avg_forward!";

  KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
      dst, src, starts, height, width, mode);
  CHECK_SYNC("hl_sequence_avg_forward failed");
}
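
// Illustrative sketch, not part of this commit: the three `mode` variants of
// the sequence average as scalar host code (assumes <cmath>). mode 0 is the
// mean, mode 1 the plain sum, mode 2 the sum scaled by 1/sqrt(length). Name
// and placement are hypothetical.
real reduce_by_mode(real sum, int seqLength, int mode) {
  if (mode == 0) return sum / seqLength;    // average over time steps
  if (mode == 1) return sum;                // plain sum
  return sum / std::sqrt((real)seqLength);  // sqrt-normalized sum
}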
@@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst,
    int seqLength = end - start;
    if (seqLength == 0) return;
    real grad = src[gid];
    grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
                                         : grad * my_rsqrt((real)seqLength));
    for (int i = start; i < end; i++) {
      dst[i * width + col] += grad;
    }
@@ -394,7 +402,7 @@ void hl_sequence_avg_backward(real* dst,
  CHECK(mode == 0 || mode == 1 || mode == 2)
      << "mode error in hl_sequence_avg_backward!";

  KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
      dst, src, starts, height, width, mode);
  CHECK_SYNC("hl_sequence_avg_backward failed");
}
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <stdlib.h>
#include <cmath>

#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_perturbation_util.cuh"
#include "hl_time.h"

#define _USE_MATH_DEFINES
@@ -30,10 +29,16 @@ limitations under the License. */
 * centerX, centerY: translation.
 * sourceX, sourceY: output coordinates in the original image.
 */
__device__ void getTranformCoord(int x,
                                 int y,
                                 real theta,
                                 real scale,
                                 real tgtCenter,
                                 real imgCenter,
                                 real centerR,
                                 real centerC,
                                 int* sourceX,
                                 int* sourceY) {
  real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};

  // compute coordinates in the rotated and scaled image
@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
 * created by Wei Xu (genome), converted by Jiang Wang
 */

__global__ void kSamplingPatches(const real* imgs,
                                 real* targets,
                                 int imgSize,
                                 int tgtSize,
                                 const int channels,
                                 int samplingRate,
                                 const real* thetas,
                                 const real* scales,
                                 const int* centerRs,
                                 const int* centerCs,
                                 const real padValue,
                                 const int numImages) {
  const int caseIdx = blockIdx.x * 4 + threadIdx.x;
  const int pxIdx = blockIdx.y * 128 + threadIdx.y;
@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
  const int pxY = pxIdx / tgtSize;

  int srcPxX, srcPxY;
  getTranformCoord(pxX,
                   pxY,
                   thetas[imgIdx],
                   scales[imgIdx],
                   tgtCenter,
                   imgCenter,
                   centerCs[caseIdx],
                   centerRs[caseIdx],
                   &srcPxX,
                   &srcPxY);

  imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
 *
 * created by Wei Xu
 */
void hl_generate_disturb_params(real*& gpuAngle,
                                real*& gpuScaleRatio,
                                int*& gpuCenterR,
                                int*& gpuCenterC,
                                int numImages,
                                int imgSize,
                                real rotateAngle,
                                real scaleRatio,
                                int samplingRate,
                                bool isTrain) {
  // The number of output samples.
  int numPatches = numImages * samplingRate;
@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
    for (int i = 0; i < numImages; i++) {
      r_angle[i] =
          (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0)  // NOLINT
                                          - 0.5);
      s_ratio[i] =
          1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio;  // NOLINT
    }
@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
      int pxY =
          (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT

      const real H[4] = {cos(-r_angle[i]),
                         -sin(-r_angle[i]),
                         sin(-r_angle[i]),
                         cos(-r_angle[i])};
      real x = pxX - imgCenter;
      real y = pxY - imgCenter;
      real xx = H[0] * x + H[1] * y;
@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
  delete[] center_c;
}
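
// Illustrative sketch, not part of this commit: the 2x2 matrix H built above,
// {cos(-t), -sin(-t), sin(-t), cos(-t)}, rotates a point (given relative to
// the image center) by -t, i.e. it maps target coordinates back into the
// unrotated source image. Name and placement are hypothetical.
inline void rotate_back(const real H[4], real x, real y, real* xx, real* yy) {
  *xx = H[0] * x + H[1] * y;
  *yy = H[2] * x + H[3] * y;
}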
void hl_conv_random_disturb_with_params(const real* images,
                                        int imgSize,
                                        int tgtSize,
                                        int channels,
                                        int numImages,
                                        int samplingRate,
                                        const real* gpuRotationAngle,
                                        const real* gpuScaleRatio,
                                        const int* gpuCenterR,
@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
  dim3 threadsPerBlock(4, 128);
  dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));

  kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
                                                   target,
                                                   imgSize,
                                                   tgtSize,
                                                   channels,
                                                   samplingRate,
                                                   gpuRotationAngle,
                                                   gpuScaleRatio,
                                                   gpuCenterR,
                                                   gpuCenterC,
                                                   paddingValue,
                                                   numImages);

  hl_device_synchronize();
}

void hl_conv_random_disturb(const real* images,
                            int imgSize,
                            int tgtSize,
                            int channels,
                            int numImages,
                            real scaleRatio,
                            real rotateAngle,
                            int samplingRate,
                            real* gpu_r_angle,
                            real* gpu_s_ratio,
                            int* gpu_center_r,
                            int* gpu_center_c,
                            int paddingValue,
                            bool isTrain,
                            real* targets) {
  // generate the random disturbance sequence and the sampling locations
  hl_generate_disturb_params(gpu_r_angle,
                             gpu_s_ratio,
                             gpu_center_r,
                             gpu_center_c,
                             numImages,
                             imgSize,
                             rotateAngle,
                             scaleRatio,
                             samplingRate,
                             isTrain);

  hl_conv_random_disturb_with_params(images,
                                     imgSize,
                                     tgtSize,
                                     channels,
                                     numImages,
                                     samplingRate,
                                     gpu_r_angle,
                                     gpu_s_ratio,
                                     gpu_center_r,
                                     gpu_center_c,
                                     paddingValue,
                                     targets);
}
@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"

template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
                                int ldo,
                                real* table,
                                int ldt,
                                int* ids,
                                int numSamples,
                                int tableSize,
@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  while (idy < numSamples) {
    int tableId = ids[idy];
    if ((0 <= tableId) && (tableId < tableSize)) {
      real* out = output + idy * ldo;
      real* tab = table + tableId * ldt;
      for (int i = idx; i < dim; i += blockDimX) {
        if (AddRow) {
          paddle::paddleAtomicAdd(&tab[i], out[i]);
@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  }
}

void hl_matrix_select_rows(real* output,
                           int ldo,
                           real* table,
                           int ldt,
                           int* ids,
                           int numSamples,
                           int tableSize,
@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, ldo, table, ldt, ids, numSamples, tableSize, dim);

  CHECK_SYNC("hl_matrix_select_rows failed");
}

void hl_matrix_add_to_rows(real* table,
                           int ldt,
                           real* input,
                           int ldi,
                           int* ids,
                           int numSamples,
                           int tableSize,
@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
      input, ldi, table, ldt, ids, numSamples, tableSize, dim);

  CHECK_SYNC("hl_matrix_add_to_rows failed");
}
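
// Illustrative sketch, not part of this commit: host-side semantics of the two
// entry points above. hl_matrix_select_rows gathers table rows into the
// output; hl_matrix_add_to_rows scatters rows back into the table (the GPU
// path uses paddleAtomicAdd because several samples may share one table row).
// Name and placement are hypothetical.
void select_rows_cpu(real* output, int ldo, real* table, int ldt,
                     const int* ids, int numSamples, int dim) {
  for (int s = 0; s < numSamples; ++s) {
    for (int i = 0; i < dim; ++i) {
      output[s * ldo + i] += table[ids[s] * ldt + i];
    }
  }
}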
template <class T, int blockDimX, int gridDimX>
__global__ void KeVectorSelect(
    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
  int idx = threadIdx.x + blockDimX * blockIdx.x;
  while (idx < sizei) {
    int index = ids[idx];
@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
}

template <class T>
void hl_vector_select_from(
    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
  CHECK_NOTNULL(dst);
  CHECK_NOTNULL(src);
  CHECK_NOTNULL(ids);
@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
  dim3 threads(512, 1);
  dim3 grid(8, 1);
  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
      dst, sized, src, sizes, ids, sizei);
  CHECK_SYNC("hl_vector_select_from failed");
}

template void hl_vector_select_from(real* dst,
                                    int sized,
                                    const real* src,
                                    int sizes,
                                    const int* ids,
                                    int sizei);
template void hl_vector_select_from(
    int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);
@@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_sparse.ph"
#include "hl_top_k.h"
#include "paddle/utils/Logging.h"

// using namespace hppl;

struct Pair {
  __device__ __forceinline__ Pair() {}

  __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}

  __device__ __forceinline__ void set(real value, int id) {
    v_ = value;
    id_ = id;
  }

  __device__ __forceinline__ void operator=(const Pair& in) {
    v_ = in.v_;
    id_ = in.id_;
  }

  __device__ __forceinline__ bool operator<(const real value) const {
    return (v_ < value);
  }

  __device__ __forceinline__ bool operator<(const Pair& in) const {
    return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
  }

  __device__ __forceinline__ bool operator>(const Pair& in) const {
    return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
  }
@@ -58,8 +50,9 @@ struct Pair {
  int id_;
};

__device__ __forceinline__ void addTo(Pair topK[],
                                      const Pair& p,
                                      int beamSize) {
  for (int k = beamSize - 2; k >= 0; k--) {
    if (topK[k] < p) {
      topK[k + 1] = topK[k];
@@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) {
  topK[0] = p;
}

template <int beamSize>
__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) {
  for (int k = beamSize - 2; k >= 0; k--) {
    if (topK[k] < p) {
      topK[k + 1] = topK[k];
@@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) {
  topK[0] = p;
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* src, int idx, int dim, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < src[idx]) {
      Pair tmp(src[idx], idx);
@@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < src[idx]) {
      Pair tmp(src[idx], idx);
@@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim,
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* val, int* col, int idx, int dim, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < val[idx]) {
      Pair tmp(val[idx], col[idx]);
@@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col,
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(Pair topK[],
                                        real* val,
                                        int* col,
                                        int idx,
                                        int dim,
                                        const Pair& max,
                                        int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < val[idx]) {
      Pair tmp(val[idx], col[idx]);
@@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
                                              int& beam,
                                              int beamSize,
                                              real* src,
                                              bool& firstStep,
                                              bool& isEmpty,
                                              Pair& max,
                                              int dim,
                                              const int tid) {
  if (beam > 0) {
    int length = beam < beamSize ? beam : beamSize;
    if (firstStep) {
@@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
      }
    }
    if (!isEmpty) {
      getTopK<blockSize>(topK + maxLength - beam, src, tid, dim, max, length);
    }
  }
@@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
                                              int& beam,
                                              int beamSize,
                                              real* val,
                                              int* col,
                                              bool& firstStep,
                                              bool& isEmpty,
                                              Pair& max,
                                              int dim,
                                              const int tid) {
  if (beam > 0) {
    int length = beam < beamSize ? beam : beamSize;
    if (firstStep) {
@@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
      }
    }
    if (!isEmpty) {
      getTopK<blockSize>(
          topK + maxLength - beam, val, col, tid, dim, max, length);
    }
  }
@@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void blockReduce(Pair* shTopK,
                                            int* maxId,
                                            Pair topK[],
                                            real** topVal,
                                            int** topIds,
                                            int& beam,
                                            int& beamSize,
                                            const int tid,
                                            const int warp) {
  while (true) {
    __syncthreads();
    if (tid < blockSize / 2) {
@@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
      }
    }
    __syncthreads();
    for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
      if (tid < stride) {
        if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
          maxId[tid] = maxId[tid + stride];
@@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
 * 3. go to the second step, until one thread's topK value is null;
 * 4. go to the first step, until the topK value is obtained.
 */
template <int maxLength, int blockSize>
__global__ void KeMatrixTopK(real* topVal,
                             int ldv,
                             int* topIds,
                             real* src,
                             int lds,
                             int dim,
                             int beamSize) {
  __shared__ Pair shTopK[blockSize];
@@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
    topK[k].set(-HL_FLOAT_MAX, -1);
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }
}
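
// Illustrative sketch, not part of this commit: the per-row result that
// KeMatrixTopK computes, as straightforward host code (assumes <algorithm>,
// <utility> and <vector>). Ties are broken toward the smaller index, matching
// Pair::operator<. Name and placement are hypothetical.
void top_k_row_cpu(
    const real* src, int dim, int beamSize, real* topVal, int* topIds) {
  std::vector<std::pair<real, int> > v(dim);
  for (int i = 0; i < dim; ++i) v[i] = std::make_pair(src[i], i);
  std::partial_sort(v.begin(),
                    v.begin() + beamSize,
                    v.end(),
                    [](const std::pair<real, int>& a,
                       const std::pair<real, int>& b) {
                      return a.first > b.first ||
                             (a.first == b.first && a.second < b.second);
                    });
  for (int k = 0; k < beamSize; ++k) {
    topVal[k] = v[k].first;
    topIds[k] = v[k].second;
  }
}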
template <int maxLength, int blockSize>
__global__ void KeSMatrixTopK(real* topVal,
                              int ldv,
                              int* topIds,
                              real* val,
                              int* row,
                              int* col,
@@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
    topK[k].set(-HL_FLOAT_MAX, -1);
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }
}

void hl_matrix_top_k(real* topVal,
                     int ldv,
                     int* topIds,
                     real* src,
                     int lds,
                     int dim,
                     int beamSize,
                     int numSamples) {
@@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv,
  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, src, lds, dim, beamSize);

  CHECK_SYNC("hl_matrix_top_k failed");
}

void hl_sparse_matrix_top_k(real* topVal,
                            int ldv,
                            int* topIds,
                            hl_sparse_matrix_s src,
                            int beamSize,
                            int numSamples) {
  CHECK_NOTNULL(topVal);
  CHECK_NOTNULL(topIds);
  CHECK_NOTNULL(src);
  CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";

  hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
  if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
    LOG(FATAL) << "parameter src is null!";
  }

  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);

  CHECK_SYNC("hl_sparse_matrix_top_k failed");
}
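
// Illustrative sketch, not part of this commit: the CSR layout consumed by
// KeSMatrixTopK above. For row r, the nonzero values live in
// csr_val[csr_row[r] .. csr_row[r + 1]) with their column indices in the
// matching csr_col slots. Name and placement are hypothetical.
real csr_row_sum(const real* csr_val, const int* csr_row, int r) {
  real s = 0;
  for (int k = csr_row[r]; k < csr_row[r + 1]; ++k) {
    s += csr_val[k];
  }
  return s;
}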
@@ -392,10 +402,12 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
 * 3. go to the second step, until one thread's topK value is null;
 * 4. go to the first step, until the topK value is obtained.
 */
template <int maxLength, int blockSize>
__global__ void KeMatrixTopKClassificationError(real* topVal,
                                                int ldv,
                                                int* topIds,
                                                real* src,
                                                int lds,
                                                int dim,
                                                int beamSize,
                                                int* label,
@@ -420,12 +432,12 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
  }

  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }

  __syncthreads();
@@ -440,9 +452,11 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
  }
}

void hl_matrix_classification_error(real* topVal,
                                    int ldv,
                                    int* topIds,
                                    real* src,
                                    int lds,
                                    int dim,
                                    int topkSize,
                                    int numSamples,
@@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv,
  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeMatrixTopKClassificationError<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);

  CHECK_SYNC("hl_matrix_top_k classification error failed");
}
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

syntax = "proto2";
package paddle.framework;

// Attribute Type for paddle's Op.
......
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

syntax = "proto2";
package paddle.framework;

import "attribute.proto";
......
@@ -15,10 +15,11 @@ limitations under the License. */

// Protocol Message for 3rd-party language binding.
//
// Paddle Python package will use `OpProto` to generate op creation methods.
// The op creation methods take user's input and generate an `OpDesc` proto
// message, then pass `OpDesc` to the C++ side and create the Op pointer.
//
syntax = "proto2";
package paddle.framework;

import "attribute.proto";
@@ -32,13 +33,14 @@ message AttrProto {
  // Supported attribute type.
  required AttrType type = 2;

  // Supported attribute comments. It helps 3rd-party language generate
  // doc-string.
  required string comment = 3;

  // If that attribute is generated, the Paddle third-party language binding
  // is responsible for filling it in. End users should not set it.
  optional bool generated = 4 [ default = false ];
}

// Input or output message for 3rd-party language binding.
@@ -48,7 +50,8 @@ message VarProto {
  // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
  required string name = 1;

  // The comment for that input. It helps 3rd-party language generate
  // doc-string.
  required string comment = 2;

  // Whether that input/output can be a list.
@@ -70,7 +73,7 @@ message VarProto {
  //   }
  // }
  //
  optional bool multiple = 3 [ default = false ];

  // It marks the output as a temporary output. That output is not used by
  // the user, but is used by another op internally as input. If another op
  // does not use
@@ -83,7 +86,7 @@ message VarProto {
  // attrs = {
  //   "temporary_index": [1]
  // }
  optional bool temporary = 4 [ default = false ];

  // The gradient of an operator can be ignored immediately,
  // e.g. for operator AddOp, y = x1 + x2, the gradients dy/dx1 and dy/dx2
@@ -110,5 +113,4 @@ message OpProto {
  // The type of that Op.
  required string type = 5;
}
@@ -22,14 +22,14 @@ namespace framework {

template <>
Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
    platform::CPUPlace, Eigen::DefaultDevice>() const {
  return *device_context_->get_eigen_device<Eigen::DefaultDevice>();
}

#ifndef PADDLE_ONLY_CPU
template <>
Eigen::GpuDevice&
ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
  return *device_context_->get_eigen_device<Eigen::GpuDevice>();
}
#endif
......
@@ -174,7 +174,11 @@ class OperatorContext {
  template <typename T>
  T* Output(const size_t index) const {
    auto var = OutputVar(index);
    PADDLE_ENFORCE(
        var != nullptr,
        "Output(%d) should not be nullptr, which means variable [%s] does "
        "not exist in scope",
        index, op_.outputs_[index]);
    return var->GetMutable<T>();
  }
@@ -252,7 +256,7 @@ struct EigenDeviceConverter<platform::GPUPlace> {
class ExecutionContext : public OperatorContext {
 public:
  ExecutionContext(const OperatorBase* op, const Scope& scope,
                   const platform::DeviceContext* device_context)
      : OperatorContext(op, scope), device_context_(device_context) {}

  template <typename PlaceType,
@@ -260,9 +264,9 @@ class ExecutionContext : public OperatorContext {
            typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
  DeviceType& GetEigenDevice() const;

  platform::Place GetPlace() const { return device_context_->GetPlace(); }

  const platform::DeviceContext* device_context_;
};
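
// Illustrative sketch, not part of this commit: why device_context_ became a
// pointer. A reference member implicitly deletes copy assignment, while a
// pointer member keeps the holder assignable and lets callers take &dev_ctx
// from a const reference at the call site. These struct names are
// hypothetical.
struct HolderByRef {
  const platform::DeviceContext& ctx_;  // copy assignment deleted
};
struct HolderByPtr {
  const platform::DeviceContext* ctx_;  // assignable and reseatable
};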
class OpKernel {
@@ -311,7 +315,7 @@ class OperatorWithKernel : public OperatorBase {
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const final {
    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
    opKernel->Compute(ExecutionContext(this, scope, &dev_ctx));
  }

  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
......
@@ -157,22 +157,22 @@ class CPUKernalMultiInputsTest : public OpKernel {
    ASSERT_EQ(xs[2], "x2");

    auto inVar0 = ctx.MultiInputVar("xs");
    ASSERT_EQ(inVar0.size(), 3U);

    auto intVar1 = ctx.InputVar("k");
    ASSERT_NE(intVar1, nullptr);

    auto outVar0 = ctx.MultiOutputVar("ys");
    ASSERT_EQ(outVar0.size(), 2U);

    auto inTensor0 = ctx.MultiInput<Tensor>("xs");
    ASSERT_EQ(inTensor0.size(), 3U);

    auto intTensor1 = ctx.Input<Tensor>("k");
    ASSERT_NE(intTensor1, nullptr);

    auto outTensor0 = ctx.MultiOutput<Tensor>("ys");
    ASSERT_EQ(outTensor0.size(), 2U);

    auto k = ctx.op_.Input("k");
    ASSERT_EQ(k, "k0");
......
@@ -18,10 +18,10 @@ limitations under the License. */
namespace paddle {

TEST(BlockExpandForward, real) {
  for (size_t batchSize : {5}) {
    for (size_t channels : {1, 5}) {
      for (size_t inputHeight : {5, 33}) {
        for (size_t inputWidth : {5, 32}) {
          for (size_t block : {1, 3, 5}) {
            for (size_t stride : {1, 2}) {
              for (size_t padding : {0, 1}) {
@@ -61,10 +61,10 @@ TEST(BlockExpandForward, real) {
}

TEST(BlockExpandBackward, real) {
  for (size_t batchSize : {5}) {
    for (size_t channels : {1, 5}) {
      for (size_t inputHeight : {5, 33}) {
        for (size_t inputWidth : {5, 32}) {
          for (size_t block : {1, 3, 5}) {
            for (size_t stride : {1, 2}) {
              for (size_t padding : {0, 1}) {
......
@@ -32,7 +32,7 @@ TEST(BufferTest, SequenceIdArg) {
                  sizeOfValuType(VALUE_TYPE_INT32));
  SequenceIdArg buffer(memory.getBuf(), shape);
  EXPECT_EQ(buffer.data(), memory.getBuf());
  EXPECT_EQ(buffer.numSeqs(), 9U);
}

}  // namespace paddle
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "ContextProjectionOp.h"
#include "hl_base.h"

namespace paddle {

@@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input,
  int block_size = blockDim.x;
  int sequenceId = blockIdx.x;
  int seq_start = sequence[sequenceId];
  int seq_end = sequence[sequenceId + 1];
  real value = 0;

  int instances = seq_end - seq_start + context_length - 1;
@@ -50,7 +50,8 @@ __global__ void KeContextProjectionForward(const real* input,
      if (padding) {
        value =
            weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
                       input_dim +
                   idx];
      } else {
        continue;
      }
@@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input,
  dim3 grid(blocks_x, blocks_y);

  if (weight) {
    KeContextProjectionForward<true><<<grid, threads, 0, STREAM_DEFAULT>>>(
        input,
        sequence,
        weight,
        output,
        input_dim,
        context_length,
        context_start,
        begin_pad);
  } else {
    KeContextProjectionForward<false><<<grid, threads, 0, STREAM_DEFAULT>>>(
        input,
        sequence,
        weight,
        output,
        input_dim,
        context_length,
        context_start,
        begin_pad);
  }
  CHECK_SYNC("hl_context_projection_forward failed");
}
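
// Illustrative sketch, not part of this commit: host-side semantics of the
// forward pass for one output row. Row i concatenates the input rows in the
// window [i + context_start, i + context_start + context_length); positions
// falling outside the sequence come from the learned padding weights in the
// kernel above, which this sketch replaces with zeros. Name and placement are
// hypothetical.
void context_projection_row_cpu(const real* input,
                                real* output,
                                int i,
                                int seq_len,
                                int input_dim,
                                int context_length,
                                int context_start) {
  for (int j = 0; j < context_length; ++j) {
    int src = i + context_start + j;
    for (int d = 0; d < input_dim; ++d) {
      output[i * context_length * input_dim + j * input_dim + d] =
          (src >= 0 && src < seq_len) ? input[src * input_dim + d] : 0;
    }
  }
}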
@@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
  int block_size = blockDim.x;
  int sequenceId = blockIdx.x;
  int seq_start = sequence[sequenceId];
  int seq_end = sequence[sequenceId + 1];
  real value = 0;

  int instances = seq_end - seq_start + context_length - 1;
@@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad,
  int blocks_y = 1;
  dim3 threads(block_size, 1);
  dim3 grid(blocks_x, blocks_y);
  KeContextProjectionBackwardData<<<grid, threads, 0, STREAM_DEFAULT>>>(
      out_grad, sequence, input_grad, input_dim, context_length, context_start);
  CHECK_SYNC("hl_context_projection_backward_data failed");
}

@@ -231,7 +244,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                    context_start);
}

template <int THREADS_X, int THREADS_Y>
__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
                                                  const int* sequence,
                                                  real* w_grad,
@@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
  if (weight_idx < w_dim) {
    for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
      int seq_start = sequence[seqId];
      int seq_end = sequence[seqId + 1];
      output_r =
          const_cast<real*>(out_grad) + seq_start * w_dim * context_length;

      if (context_start < 0) {
        if (padId + context_start < 0) {
          instanceId = padId;
        } else {
          // begin_pad > 0;
          instanceId =
              (padId - begin_pad) + (seq_end - seq_start) - context_start;
        }
      } else {
        if (padId + (seq_end - seq_start) < context_start) {
@@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
        }
      }

      int outx =
          (instanceId - context_length) < 0 ? instanceId : (context_length - 1);
      int outy = (instanceId - context_length) < 0
                     ? 0
                     : (instanceId - (context_length - 1));
      output_r += outy * w_dim * context_length + outx * w_dim;
      for (int j = outy; j < seq_end - seq_start; j++) {
        value += output_r[weight_idx];
@@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
  }
  __syncthreads();

  for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) {
    if (idy < stride) {
      sum_s[idy][idx] += sum_s[idy + stride][idx];
    }
@@ -339,16 +353,21 @@ void hl_context_projection_backward_weight(const real* out_grad,
  dim3 threads(threads_x, threads_y);
  dim3 grid(blocks_x, 1);

  KeContextProjectionBackwardWeight<32,
                                    32><<<grid, threads, 0, STREAM_DEFAULT>>>(
      out_grad,
      sequence,
      w_grad,
      num_sequences,
      w_dim,
      context_length,
      context_start,
      begin_pad);
  CHECK_SYNC("hl_context_projection_backward_weight failed");
}

template <>
void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                      GpuMatrix& w_grad,
                                                      const GpuIVector& seq_vec,
                                                      size_t context_length,
@@ -378,15 +397,10 @@ void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                size_t total_pad) {
  if (in_grad) {
    ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
        out_grad, in_grad, sequence, context_length, context_start);
  }
  if (is_padding && w_grad) {
    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(out_grad,
                                                     w_grad,
                                                     sequence,
                                                     context_length,
......
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "CosSimOp.h"
 #include "hl_base.h"
 #include "hl_device_functions.cuh"
-#include "CosSimOp.h"
 
 namespace paddle {
 
-template<int block_size>
+template <int block_size>
 __global__ void KeCosSim(real* output,
                          const real* input1,
                          const real* input2,
@@ -78,8 +78,8 @@ void hlCossim(real* output,
   dim3 threads(block_size, 1);
   dim3 grid(1, input1_height);
-  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (output, input1, input2, width, input1_height, input2_height, scale);
+  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, input1, input2, width, input1_height, input2_height, scale);
   CHECK_SYNC("hlCossim failed");
 }
@@ -99,7 +99,7 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
   hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
 }
 
-template<int block_size>
+template <int block_size>
 __global__ void KeCosSimDerivative(const real* grad,
                                    const real* output,
                                    const real* prev_out_x,
@@ -148,13 +148,12 @@ __global__ void KeCosSimDerivative(const real* grad,
   if (xy[0] == 0) {
     real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
     for (int index = tid; index < width; index += block_size) {
-      prev_grad_x[index] +=
-          scale * grad[ty] * prev_out_y[index] * reciprocal;
+      prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal;
       if (input2_height > 1) {
-        prev_grad_y[index] +=
-            scale * grad[ty] * prev_out_x[index] * reciprocal;
+        prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal;
       } else {
-        paddle::paddleAtomicAdd(prev_grad_y + index,
+        paddle::paddleAtomicAdd(
+            prev_grad_y + index,
             scale * grad[ty] * prev_out_x[index] * reciprocal);
       }
     }
@@ -163,16 +162,17 @@ __global__ void KeCosSimDerivative(const real* grad,
     real reciprocalSquareSumX = 1.0 / xx[0];
     real reciprocalSquareSumY = 1.0 / yy[0];
     for (int index = tid; index < width; index += block_size) {
-      prev_grad_x[index] += output[ty] * grad[ty] *
-          (prev_out_y[index] * reciprocalXY -
-           prev_out_x[index] * reciprocalSquareSumX);
+      prev_grad_x[index] +=
+          output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY -
+                                   prev_out_x[index] * reciprocalSquareSumX);
       if (input2_height > 1) {
-        prev_grad_y[index] += output[ty] * grad[ty] *
-            (prev_out_x[index] * reciprocalXY -
-             prev_out_y[index] * reciprocalSquareSumY);
+        prev_grad_y[index] +=
+            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
+                                     prev_out_y[index] * reciprocalSquareSumY);
       } else {
-        paddle::paddleAtomicAdd(prev_grad_y + index, output[ty] * grad[ty] *
-            (prev_out_x[index] * reciprocalXY -
+        paddle::paddleAtomicAdd(
+            prev_grad_y + index,
+            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
                                      prev_out_y[index] * reciprocalSquareSumY));
       }
     }
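A reading aid for the two branches above: with out = scale * (x · y) / (‖x‖ ‖y‖), the general branch implements the cosine-similarity gradient, in LaTeX

    \frac{\partial\,\mathrm{out}}{\partial x_i}
        = \mathrm{out}\left(\frac{y_i}{x \cdot y} - \frac{x_i}{\lVert x \rVert^{2}}\right),
    \qquad
    \frac{\partial\,\mathrm{out}}{\partial y_i}
        = \mathrm{out}\left(\frac{x_i}{x \cdot y} - \frac{y_i}{\lVert y \rVert^{2}}\right),

each multiplied by the upstream gradient grad[ty]; reciprocalSquareSumX is 1/‖x‖² as defined above, and reciprocalXY is presumably 1/(x · y), defined in a part of the kernel not shown in this diff. The xy[0] == 0 branch is the degenerate case where out vanishes and the gradient reduces to scale * grad[ty] * y_i / (‖x‖ ‖y‖), and symmetrically for y.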
@@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad,
   const int block_size = 256;
   dim3 threads(block_size, 1);
   dim3 grid(1, input1_height);
-  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y, width,
-       input1_height, input2_height, scale);
+  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad,
+      output,
+      prev_out_x,
+      prev_out_y,
+      prev_grad_x,
+      prev_grad_y,
+      width,
+      input1_height,
+      input2_height,
+      scale);
   CHECK_SYNC("hlCossimDerivate failed");
 }
@@ -214,8 +222,8 @@ void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                      real scale) {
   CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
         in2_val.getData() && in1_grad.getData() && in2_grad.getData());
-  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_
-      && in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
+  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ &&
+        in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
       << "Matrix types are not equally GPU";
 
   size_t dim = in1_val.getWidth();
...
@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "hl_base.h"
 #include "CropOp.h"
+#include "hl_base.h"
 
 namespace paddle {
 
-__global__ void KeCrop(real* outputs, const real* inputs,
-                       int inC, int inH, int inW,
-                       int cropC, int cropH, int cropW,
-                       int outC, int outH, int outW, int nthreads) {
+__global__ void KeCrop(real* outputs,
+                       const real* inputs,
+                       int inC,
+                       int inH,
+                       int inW,
+                       int cropC,
+                       int cropH,
+                       int cropW,
+                       int outC,
+                       int outH,
+                       int outW,
+                       int nthreads) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < nthreads) {
     const int w = idx % outW;
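KeCrop above assigns one thread per output element and recovers (n, c, h, w) from the flat row-major NCHW index by successive division and modulo, starting with const int w = idx % outW. A small host-side sketch of that decomposition, with illustrative values not taken from this commit:

    // Recover (n, c, h, w) from a flat index into an N x C x H x W
    // tensor stored row-major; the sizes and index here are examples.
    #include <cstdio>

    int main() {
      const int outC = 3, outH = 4, outW = 5;
      const int idx = 42;  // 0 <= idx < N * outC * outH * outW
      const int w = idx % outW;
      const int h = (idx / outW) % outH;
      const int c = (idx / (outW * outH)) % outC;
      const int n = idx / (outW * outH * outC);
      std::printf("n=%d c=%d h=%d w=%d\n", n, c, h, w);  // n=0 c=2 h=0 w=2
      return 0;
    }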
@@ -58,16 +66,33 @@ void Crop<DEVICE_TYPE_GPU>(real* outputs,
   int blockSize = 1024;
   int gridSize = (nth + blockSize - 1) / blockSize;
-  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (outputs, inputs, inC, inH, inW, cropC, cropH, cropW,
-       outC, outH, outW, nth);
+  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
+                                                     inputs,
+                                                     inC,
+                                                     inH,
+                                                     inW,
+                                                     cropC,
+                                                     cropH,
+                                                     cropW,
+                                                     outC,
+                                                     outH,
+                                                     outW,
+                                                     nth);
   CHECK_SYNC("Crop");
 }
 
-__global__ void KeCropDiff(const real* inGrad, real* outGrad,
-                           int inC, int inH, int inW,
-                           int cropC, int cropH, int cropW,
-                           int outC, int outH, int outW, int nthreads) {
+__global__ void KeCropDiff(const real* inGrad,
+                           real* outGrad,
+                           int inC,
+                           int inH,
+                           int inW,
+                           int cropC,
+                           int cropH,
+                           int cropW,
+                           int outC,
+                           int outH,
+                           int outW,
+                           int nthreads) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < nthreads) {
     const int w = idx % inW;
@@ -107,9 +132,18 @@ void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
   int blockSize = 1024;
   int gridSize = (nth + blockSize - 1) / blockSize;
-  KeCropDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (inGrad, outGrad, inC, inH, inW, cropC, cropH, cropW,
-       outC, outH, outW, nth);
+  KeCropDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
+                                                         outGrad,
+                                                         inC,
+                                                         inH,
+                                                         inW,
+                                                         cropC,
+                                                         cropH,
+                                                         cropW,
+                                                         outC,
+                                                         outH,
+                                                         outW,
+                                                         nth);
   CHECK_SYNC("CropGrad");
 }
...
(2 file diffs collapsed.)
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "hl_base.h"
 #include "MulOp.h"
+#include "hl_base.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
...
(4 file diffs collapsed.)
@@ -57,8 +57,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
     convGradFilterType = "GemmConvGradFilter";
   }
 
-  if (FLAGS_use_nnpack) {
-    CHECK_EQ(isDeconv_, false);
+  if (FLAGS_use_nnpack && !isDeconv_) {
     createFunction(forward_,
                    "NNPACKConv",
                    FuncConfig()
...
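A note on the ExpandConvLayer hunk above: the old code aborted via CHECK_EQ(isDeconv_, false) whenever FLAGS_use_nnpack was set on a deconvolution layer; folding the test into the condition means NNPACK is now simply skipped for deconvolution, presumably leaving the GEMM-based functions selected earlier in init in effect.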
(1 file diff collapsed.)
@@ -29,7 +29,7 @@ public:
       vals.push_back(s.str());
     }
     size_t pos = 0;
-    int i = 0;
+    size_t i = 0;
     std::ostringstream s;
     const std::string& format = config_.user_arg();
     while (true) {
...
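The int to size_t change above makes the type of i match pos (a size_t) and the unsigned quantities it is presumably compared against later in the formatting loop, silencing a signed/unsigned comparison warning without changing behavior.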
(41 file diffs collapsed.)