提交 4bc1dc26 编写于 作者: T tensor-tang

Merge remote-tracking branch 'upstream/develop' into merge-doc

......@@ -24,7 +24,7 @@
description: Format files with ClangFormat.
entry: clang-format -i
language: system
files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:
......
......@@ -36,8 +36,8 @@ include(simd)
################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND})
option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND})
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." OFF)
option(WITH_MKLML "Compile PaddlePaddle with mklml package." OFF)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
......
......@@ -27,13 +27,16 @@ RUN apt-get update && \
git python-pip python-dev openssh-server bison \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-numpy python-matplotlib gcc-4.8 g++-4.8 \
python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format-3.8 swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools && \
apt-get clean -y
# paddle is using numpy.flip, which is introduced since 1.12.0
RUN pip --no-cache-dir install 'numpy>=1.12.0'
# Install Go and glide
RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
tar -C /usr/local -xzf go.tgz && \
......
......@@ -56,11 +56,14 @@ macro(add_style_check_target TARGET_NAME)
# cpplint code style
get_filename_component(base_filename ${filename} NAME)
set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD
add_custom_command(OUTPUT ${CUR_GEN} PRE_BUILD
COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
"--filter=${STYLE_FILTER}"
"--write-success=${CUR_GEN}" ${filename}
DEPENDS ${filename} ${PROJ_ROOT}/paddle/scripts/cpplint.py
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
add_custom_target(${base_filename}.cpplint DEPENDS ${CUR_GEN})
add_dependencies(${TARGET_NAME} ${base_filename}.cpplint)
endif()
endforeach()
endif()
......
......@@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
ExternalProject_Add(
extern_gflags
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/gflags/gflags.git"
# TODO(yiwang): The annoying warnings mentioned in
# https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by
# gflags. I fired a PR https://github.com/gflags/gflags/pull/230
# to fix it. Before it gets accepted by the gflags team, we use
# my personal fork, which contains above fix, temporarily. Let's
# change this back to the official Github repo once my PR is
# merged.
GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git"
PREFIX ${GFLAGS_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
......@@ -24,7 +24,6 @@ IF(WITH_PYTHON)
ENDIF(WITH_PYTHON)
SET(py_env "")
SET(USE_VIRTUALENV_FOR_TEST 1)
IF(PYTHONINTERP_FOUND)
find_python_module(pip REQUIRED)
find_python_module(numpy REQUIRED)
......
......@@ -187,7 +187,13 @@ function(cc_library TARGET_NAME)
endif()
# cpplint code style
add_style_check_target(${TARGET_NAME} ${cc_library_SRCS})
foreach(source_file ${cc_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
else(cc_library_SRCS)
if (cc_library_DEPS)
......@@ -239,6 +245,14 @@ function(nv_library TARGET_NAME)
add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
endif()
# cpplint code style
foreach(source_file ${nv_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
else(nv_library_SRCS)
if (nv_library_DEPS)
merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
......
......@@ -118,7 +118,6 @@ endfunction()
macro(add_unittest_without_exec TARGET_NAME)
add_executable(${TARGET_NAME} ${ARGN})
link_paddle_test(${TARGET_NAME})
add_style_check_target(${TARGET_NAME} ${ARGN})
endmacro()
# add_unittest
......@@ -150,9 +149,12 @@ endfunction()
# Create a python unittest using run_python_tests.sh,
# which takes care of making correct running environment
function(add_python_test TEST_NAME)
add_test(NAME ${TEST_NAME}
COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR}
bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh
${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
foreach(arg ${ARGN})
get_filename_component(py_fn ${arg} NAME_WE)
set(TRG_NAME ${TEST_NAME}_${py_fn})
add_test(NAME ${TRG_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
python2 ${arg}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endforeach()
endfunction()
......@@ -21,22 +21,15 @@
#
# It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...}
#
if ! python -c "import paddle" >/dev/null 2>/dev/null; then
PYPATH=""
set -x
while getopts "d:" opt; do
case $opt in
d)
PYPATH=$OPTARG
;;
esac
done
shift $(($OPTIND - 1))
export PYTHONPATH=$PYPATH:$PYTHONPATH
$@
else
echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."
echo "Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'"
exit 1
fi
PYPATH=""
set -x
while getopts "d:" opt; do
case $opt in
d)
PYPATH=$OPTARG
;;
esac
done
shift $(($OPTIND - 1))
export PYTHONPATH=$PYPATH:$PYTHONPATH
$@
......@@ -15,7 +15,6 @@ if(Boost_FOUND)
add_subdirectory(platform)
add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(pybind)
endif()
if(WITH_C_API)
......
......@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_batch_transpose.h"
#include "hl_base.h"
#include "hl_batch_transpose.h"
const int TILE_DIM = 64;
const int BLOCK_ROWS = 16;
// No bank-conflict transpose for a batch of data.
__global__ void batchTransposeNoBankConflicts(real* odata,
const real* idata,
int numSamples, int width,
int height) {
__global__ void batchTransposeNoBankConflicts(
real* odata, const real* idata, int numSamples, int width, int height) {
__shared__ float tile[TILE_DIM][TILE_DIM + 1];
const int x = blockIdx.x * TILE_DIM + threadIdx.x;
......@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
newX] = tile[threadIdx.x][j];
}
void batchTranspose(const real* input, real* output, int width, int height,
int batchSize) {
void batchTranspose(
const real* input, real* output, int width, int height, int batchSize) {
dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
(output, input, batchSize, width, height);
batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
output, input, batchSize, width, height);
CHECK_SYNC("batchTranspose failed!");
}
......@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_aggregate.h"
#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_cuda.ph"
#include "hl_aggregate.h"
#include "hl_thread.ph"
#include "hl_matrix_base.cuh"
#include "hl_thread.ph"
#include "paddle/utils/Logging.h"
/**
* @brief matrix row operator.
*/
template<class Agg, int blockSize>
__global__ void KeMatrixRowOp(Agg agg,
real *E,
real *Sum,
int dimN) {
template <class Agg, int blockSize>
__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
__shared__ real sum_s[blockSize];
int cnt = (dimN + blockSize -1) / blockSize;
int rowId = blockIdx.x + blockIdx.y*gridDim.x;
int index = rowId*dimN;
int cnt = (dimN + blockSize - 1) / blockSize;
int rowId = blockIdx.x + blockIdx.y * gridDim.x;
int index = rowId * dimN;
int tid = threadIdx.x;
int lmt = tid;
......@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg,
sum_s[tid] = tmp;
__syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) {
for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) {
sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
}
......@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg,
}
template <class Agg>
void hl_matrix_row_op(Agg agg,
real *A_d,
real *C_d,
int dimM,
int dimN) {
void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
int blocksX = dimM;
int blocksY = 1;
dim3 threads(128, 1);
dim3 grid(blocksX, blocksY);
KeMatrixRowOp<Agg, 128><<< grid, threads, 0, STREAM_DEFAULT >>>
(agg, A_d, C_d, dimN);
KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
agg, A_d, C_d, dimN);
}
void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::sum(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_row_sum failed");
}
......@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::max(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_row_max failed");
}
......@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::min(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_row_min failed");
}
/**
* @brief matrix column operator.
*/
template<class Agg>
__global__ void KeMatrixColumnOp(Agg agg,
real *E,
real *Sum,
int dimM,
int dimN) {
template <class Agg>
__global__ void KeMatrixColumnOp(
Agg agg, real *E, real *Sum, int dimM, int dimN) {
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
real tmp = agg.init();
if (rowIdx < dimN) {
......@@ -127,15 +104,12 @@ __global__ void KeMatrixColumnOp(Agg agg,
}
}
template<class Agg, int blockDimX, int blockDimY>
__global__ void KeMatrixColumnOp_S(Agg agg,
real *E,
real *Sum,
int dimM,
int dimN) {
__shared__ real _sum[blockDimX*blockDimY];
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int index = threadIdx.y;
template <class Agg, int blockDimX, int blockDimY>
__global__ void KeMatrixColumnOp_S(
Agg agg, real *E, real *Sum, int dimM, int dimN) {
__shared__ real _sum[blockDimX * blockDimY];
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int index = threadIdx.y;
real tmp = agg.init();
if (rowIdx < dimN) {
......@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
index += blockDimY;
}
}
_sum[threadIdx.x + threadIdx.y*blockDimX] = tmp;
_sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
__syncthreads();
if (rowIdx < dimN) {
if (threadIdx.y ==0) {
if (threadIdx.y == 0) {
real tmp = agg.init();
for (int i=0; i < blockDimY; i++) {
tmp = agg(tmp, _sum[threadIdx.x + i*blockDimX]);
for (int i = 0; i < blockDimY; i++) {
tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
}
Sum[rowIdx] = tmp;
}
......@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
}
template <class Agg>
void hl_matrix_column_op(Agg agg,
real *A_d,
real *C_d,
int dimM,
int dimN) {
void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
if (dimN >= 8192) {
int blocksX = (dimN + 128 -1) / 128;
int blocksX = (dimN + 128 - 1) / 128;
int blocksY = 1;
dim3 threads(128, 1);
dim3 grid(blocksX, blocksY);
KeMatrixColumnOp<Agg><<< grid, threads, 0, STREAM_DEFAULT >>>
(agg, A_d, C_d, dimM, dimN);
KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
agg, A_d, C_d, dimM, dimN);
} else {
int blocksX = (dimN + 32 -1) / 32;
int blocksX = (dimN + 32 - 1) / 32;
int blocksY = 1;
dim3 threads(32, 32);
dim3 grid(blocksX, blocksY);
KeMatrixColumnOp_S<Agg, 32, 32><<< grid, threads, 0, STREAM_DEFAULT>>>
(agg, A_d, C_d, dimM, dimN);
KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
agg, A_d, C_d, dimM, dimN);
}
return;
......@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::sum(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_column_sum failed");
}
......@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::max(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_column_max failed");
}
......@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::min(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_column_min failed");
}
......@@ -226,16 +184,16 @@ template <int blockSize>
__global__ void KeVectorSum(real *E, real *Sum, int dimM) {
__shared__ double sum_s[blockSize];
int tid = threadIdx.x;
int index = blockIdx.y*blockDim.x+threadIdx.x;
int index = blockIdx.y * blockDim.x + threadIdx.x;
sum_s[tid] = 0.0f;
while (index < dimM) {
sum_s[tid] += E[index];
index += blockDim.x*gridDim.y;
index += blockDim.x * gridDim.y;
}
__syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) {
for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) {
sum_s[tid] += sum_s[tid + stride];
}
......@@ -259,38 +217,39 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
dim3 threads(blockSize, 1);
dim3 grid(blocksX, blocksY);
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
hl_event_t hl_event = &hl_event_st;
while (!hl_cuda_event_is_ready(hl_event)) {}
while (!hl_cuda_event_is_ready(hl_event)) {
}
KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
(A_d, t_resource.gpu_mem, dimM);
KeVectorSum<128><<< 1, threads, 0, STREAM_DEFAULT >>>
(t_resource.gpu_mem, t_resource.cpu_mem, 128);
KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
A_d, t_resource.gpu_mem, dimM);
KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
t_resource.gpu_mem, t_resource.cpu_mem, 128);
hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
cudaError_t err = (cudaError_t)hl_get_device_last_error();
CHECK_EQ(cudaSuccess, err)
<< "CUDA error: " << hl_get_device_error_string((size_t)err);
CHECK_EQ(cudaSuccess, err) << "CUDA error: "
<< hl_get_device_error_string((size_t)err);
}
template <int blockSize>
__global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
__shared__ double sum_s[blockSize];
int tid = threadIdx.x;
int index = blockIdx.y*blockDim.x+threadIdx.x;
int index = blockIdx.y * blockDim.x + threadIdx.x;
sum_s[tid] = 0.0f;
while (index < dimM) {
sum_s[tid] += abs(E[index]);
index += blockDim.x*gridDim.y;
index += blockDim.x * gridDim.y;
}
__syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) {
for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) {
sum_s[tid] += sum_s[tid + stride];
}
......@@ -314,20 +273,21 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
dim3 threads(blockSize, 1);
dim3 grid(blocksX, blocksY);
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
hl_event_t hl_event = &hl_event_st;
while (!hl_cuda_event_is_ready(hl_event)) {}
while (!hl_cuda_event_is_ready(hl_event)) {
}
KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
(A_d, t_resource.gpu_mem, dimM);
KeVectorAbsSum<128><<< 1, threads, 0, STREAM_DEFAULT >>>
(t_resource.gpu_mem, t_resource.cpu_mem, 128);
KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
A_d, t_resource.gpu_mem, dimM);
KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
t_resource.gpu_mem, t_resource.cpu_mem, 128);
hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
cudaError_t err = (cudaError_t)hl_get_device_last_error();
CHECK_EQ(cudaSuccess, err)
<< "CUDA error: " << hl_get_device_error_string((size_t)err);
CHECK_EQ(cudaSuccess, err) << "CUDA error: "
<< hl_get_device_error_string((size_t)err);
}
此差异已折叠。
此差异已折叠。
此差异已折叠。
......@@ -16,36 +16,36 @@ limitations under the License. */
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"
__global__ void KeMaxSequenceForward(real *input,
const int *sequence,
__global__ void KeMaxSequenceForward(real* input,
const int* sequence,
real* output,
int *index,
int* index,
int numSequences,
int dim) {
int dimIdx = threadIdx.x;
int sequenceId = blockIdx.x;
if (sequenceId >= numSequences) return;
int start = sequence[sequenceId];
int end = sequence[sequenceId+1];
int end = sequence[sequenceId + 1];
for (int i = dimIdx; i < dim; i += blockDim.x) {
real tmp = -HL_FLOAT_MAX;
int tmpId = -1;
for (int insId = start; insId < end; insId++) {
if (tmp < input[insId*dim + i]) {
tmp = input[insId*dim + i];
if (tmp < input[insId * dim + i]) {
tmp = input[insId * dim + i];
tmpId = insId;
}
}
output[sequenceId*dim + i] = tmp;
index[sequenceId*dim + i] = tmpId;
output[sequenceId * dim + i] = tmp;
index[sequenceId * dim + i] = tmpId;
}
}
void hl_max_sequence_forward(real* input,
const int* sequence,
real* output,
int *index,
int* index,
int numSequences,
int dim) {
CHECK_NOTNULL(input);
......@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input,
dim3 threads(256, 1);
dim3 grid(numSequences, 1);
KeMaxSequenceForward<<< grid, threads, 0, STREAM_DEFAULT >>>
(input, sequence, output, index, numSequences, dim);
KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
input, sequence, output, index, numSequences, dim);
CHECK_SYNC("hl_max_sequence_forward failed");
}
__global__ void KeMaxSequenceBackward(real *outputGrad,
int *index,
real* inputGrad,
int numSequences,
int dim) {
__global__ void KeMaxSequenceBackward(
real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int colIdx = idx % dim;
if (idx < numSequences*dim) {
if (idx < numSequences * dim) {
int insId = index[idx];
inputGrad[insId * dim + colIdx] += outputGrad[idx];
}
}
void hl_max_sequence_backward(real* outputGrad,
int *index,
real* inputGrad,
int numSequences,
int dim) {
void hl_max_sequence_backward(
real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
CHECK_NOTNULL(outputGrad);
CHECK_NOTNULL(index);
CHECK_NOTNULL(inputGrad);
......@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad,
unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
dim3 threads(128, 1);
dim3 grid(blocks, 1);
KeMaxSequenceBackward<<< grid, threads, 0, STREAM_DEFAULT >>>
(outputGrad, index, inputGrad, numSequences, dim);
KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
outputGrad, index, inputGrad, numSequences, dim);
CHECK_SYNC("hl_max_sequence_backward failed");
}
template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
real* table,
int* ids,
......@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output,
while (sampleId < numSamples) {
int tableId = ids[sampleId];
if ((0 <= tableId) && (tableId < tableSize)) {
real *outputData = output + sampleId * dim;
real *tableData = table + tableId * dim;
real* outputData = output + sampleId * dim;
real* tableData = table + tableId * dim;
for (int i = idx; i < dim; i += blockDimX) {
if (AddRow == 0) {
outputData[i] += tableData[i];
......@@ -114,24 +108,27 @@ __global__ void KeMatrixAddRows(real* output,
}
}
}
sampleId += blockDimY*gridDimX;
sampleId += blockDimY * gridDimX;
}
}
template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd>
__global__
void KeSequence2Batch(real *batch,
real *sequence,
const int *batchIndex,
int seqWidth,
int batchCount) {
template <int blockDimX,
int blockDimY,
int gridDimX,
bool seq2batch,
bool isAdd>
__global__ void KeSequence2Batch(real* batch,
real* sequence,
const int* batchIndex,
int seqWidth,
int batchCount) {
int idx = threadIdx.x;
int idy = threadIdx.y;
int id = blockIdx.x + idy * gridDimX;
while (id < batchCount) {
int seqId = batchIndex[id];
real* batchData = batch + id*seqWidth;
real* seqData = sequence + seqId*seqWidth;
real* batchData = batch + id * seqWidth;
real* seqData = sequence + seqId * seqWidth;
for (int i = idx; i < seqWidth; i += blockDimX) {
if (seq2batch) {
if (isAdd) {
......@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch,
}
}
}
id += blockDimY*gridDimX;
id += blockDimY * gridDimX;
}
}
void hl_sequence2batch_copy(real *batch,
real *sequence,
const int *batchIndex,
void hl_sequence2batch_copy(real* batch,
real* sequence,
const int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {
......@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch,
dim3 threads(128, 8);
dim3 grid(8, 1);
if (seq2batch) {
KeSequence2Batch<128, 8, 8, 1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
(batch, sequence, batchIndex, seqWidth, batchCount);
KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, batchIndex, seqWidth, batchCount);
} else {
KeSequence2Batch<128, 8, 8, 0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
(batch, sequence, batchIndex, seqWidth, batchCount);
KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, batchIndex, seqWidth, batchCount);
}
CHECK_SYNC("hl_sequence2batch_copy failed");
}
void hl_sequence2batch_add(real *batch,
real *sequence,
int *batchIndex,
void hl_sequence2batch_add(real* batch,
real* sequence,
int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {
......@@ -186,23 +183,22 @@ void hl_sequence2batch_add(real *batch,
dim3 threads(128, 8);
dim3 grid(8, 1);
if (seq2batch) {
KeSequence2Batch<128, 8, 8, 1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
(batch, sequence, batchIndex, seqWidth, batchCount);
KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, batchIndex, seqWidth, batchCount);
} else {
KeSequence2Batch<128, 8, 8, 0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
(batch, sequence, batchIndex, seqWidth, batchCount);
KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, batchIndex, seqWidth, batchCount);
}
CHECK_SYNC("hl_sequence2batch_add failed");
}
template<bool normByTimes, bool seq2batch>
__global__
void KeSequence2BatchPadding(real* batch,
real* sequence,
const int* sequenceStartPositions,
const size_t sequenceWidth,
const size_t maxSequenceLength,
const size_t numSequences) {
template <bool normByTimes, bool seq2batch>
__global__ void KeSequence2BatchPadding(real* batch,
real* sequence,
const int* sequenceStartPositions,
const size_t sequenceWidth,
const size_t maxSequenceLength,
const size_t numSequences) {
int batchIdx = blockIdx.y;
int sequenceStart = sequenceStartPositions[batchIdx];
int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart;
......@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch,
if (seq2batch) {
/* sequence -> batch */
if (normByTimes) {
KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch,
sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} else {
KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch,
sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
}
} else {
/* batch -> sequence */
if (normByTimes) {
KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch,
sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} else {
KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch,
sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
}
}
CHECK_SYNC("hl_sequence2batch_copy_padding failed");
}
__device__ inline float my_rsqrt(float x) {
return rsqrtf(x);
}
__device__ inline float my_rsqrt(float x) { return rsqrtf(x); }
__device__ inline double my_rsqrt(double x) {
return rsqrt(x);
}
__device__ inline double my_rsqrt(double x) { return rsqrt(x); }
__global__ void KeSequenceAvgForward(real* dst,
real* src,
......@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst,
for (int i = start; i < end; i++) {
sum += src[i * width + col];
}
sum = mode == 1 ? sum :
(mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
: sum * my_rsqrt((real)seqLength));
dst[gid] += sum;
}
}
......@@ -347,10 +355,10 @@ void hl_sequence_avg_forward(real* dst,
int grid = DIVUP(width * height, 512);
CHECK(mode == 0 || mode == 1 || mode == 2)
<< "mode error in hl_sequence_avg_forward!";
<< "mode error in hl_sequence_avg_forward!";
KeSequenceAvgForward<<< grid, block, 0, STREAM_DEFAULT >>>
(dst, src, starts, height, width, mode);
KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_forward failed");
}
......@@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst,
int seqLength = end - start;
if (seqLength == 0) return;
real grad = src[gid];
grad = mode == 1 ? grad :
(mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength));
grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
: grad * my_rsqrt((real)seqLength));
for (int i = start; i < end; i++) {
dst[i * width + col] += grad;
}
......@@ -392,9 +400,9 @@ void hl_sequence_avg_backward(real* dst,
int grid = DIVUP(width * height, 512);
CHECK(mode == 0 || mode == 1 || mode == 2)
<< "mode error in hl_sequence_avg_backward!";
<< "mode error in hl_sequence_avg_backward!";
KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>>
(dst, src, starts, height, width, mode);
KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_backward failed");
}
此差异已折叠。
......@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include <stdlib.h>
#include "hl_cuda.h"
#include "hl_time.h"
#include <cmath>
#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_perturbation_util.cuh"
#include "hl_time.h"
#define _USE_MATH_DEFINES
......@@ -30,10 +29,16 @@ limitations under the License. */
* centerX, centerY: translation.
* sourceX, sourceY: output coordinates in the original image.
*/
__device__ void getTranformCoord(int x, int y, real theta, real scale,
real tgtCenter, real imgCenter,
real centerR, real centerC,
int* sourceX, int* sourceY) {
__device__ void getTranformCoord(int x,
int y,
real theta,
real scale,
real tgtCenter,
real imgCenter,
real centerR,
real centerC,
int* sourceX,
int* sourceY) {
real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};
// compute coornidates in the rotated and scaled image
......@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
* created by Wei Xu (genome), converted by Jiang Wang
*/
__global__ void kSamplingPatches(const real* imgs, real* targets,
int imgSize, int tgtSize, const int channels,
int samplingRate, const real* thetas,
const real* scales, const int* centerRs,
const int* centerCs, const real padValue,
__global__ void kSamplingPatches(const real* imgs,
real* targets,
int imgSize,
int tgtSize,
const int channels,
int samplingRate,
const real* thetas,
const real* scales,
const int* centerRs,
const int* centerCs,
const real padValue,
const int numImages) {
const int caseIdx = blockIdx.x * 4 + threadIdx.x;
const int pxIdx = blockIdx.y * 128 + threadIdx.y;
......@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
const int pxY = pxIdx / tgtSize;
int srcPxX, srcPxY;
getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter,
imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX,
getTranformCoord(pxX,
pxY,
thetas[imgIdx],
scales[imgIdx],
tgtCenter,
imgCenter,
centerCs[caseIdx],
centerRs[caseIdx],
&srcPxX,
&srcPxY);
imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
......@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
*
* created by Wei Xu
*/
void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
int*& gpuCenterR, int*& gpuCenterC,
int numImages, int imgSize, real rotateAngle,
real scaleRatio, int samplingRate,
void hl_generate_disturb_params(real*& gpuAngle,
real*& gpuScaleRatio,
int*& gpuCenterR,
int*& gpuCenterC,
int numImages,
int imgSize,
real rotateAngle,
real scaleRatio,
int samplingRate,
bool isTrain) {
// The number of output samples.
int numPatches = numImages * samplingRate;
......@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
for (int i = 0; i < numImages; i++) {
r_angle[i] =
(rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0) // NOLINT
- 0.5);
-
0.5);
s_ratio[i] =
1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio; // NOLINT
}
......@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
int pxY =
(int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0)); // NOLINT
const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]),
sin(-r_angle[i]), cos(-r_angle[i])};
const real H[4] = {cos(-r_angle[i]),
-sin(-r_angle[i]),
sin(-r_angle[i]),
cos(-r_angle[i])};
real x = pxX - imgCenter;
real y = pxY - imgCenter;
real xx = H[0] * x + H[1] * y;
......@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
delete[] center_c;
}
void hl_conv_random_disturb_with_params(const real* images, int imgSize,
int tgtSize, int channels,
int numImages, int samplingRate,
void hl_conv_random_disturb_with_params(const real* images,
int imgSize,
int tgtSize,
int channels,
int numImages,
int samplingRate,
const real* gpuRotationAngle,
const real* gpuScaleRatio,
const int* gpuCenterR,
......@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
dim3 threadsPerBlock(4, 128);
dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));
kSamplingPatches <<<numBlocks, threadsPerBlock>>>
(images, target, imgSize, tgtSize, channels, samplingRate,
gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC,
paddingValue, numImages);
kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
target,
imgSize,
tgtSize,
channels,
samplingRate,
gpuRotationAngle,
gpuScaleRatio,
gpuCenterR,
gpuCenterC,
paddingValue,
numImages);
hl_device_synchronize();
}
void hl_conv_random_disturb(const real* images, int imgSize,
int tgtSize, int channels, int numImages,
real scaleRatio, real rotateAngle,
int samplingRate, real* gpu_r_angle,
real* gpu_s_ratio, int* gpu_center_r,
int* gpu_center_c, int paddingValue,
bool isTrain, real* targets) {
void hl_conv_random_disturb(const real* images,
int imgSize,
int tgtSize,
int channels,
int numImages,
real scaleRatio,
real rotateAngle,
int samplingRate,
real* gpu_r_angle,
real* gpu_s_ratio,
int* gpu_center_r,
int* gpu_center_c,
int paddingValue,
bool isTrain,
real* targets) {
// generate the random disturbance sequence and the sampling locations
hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r,
gpu_center_c, numImages, imgSize, rotateAngle,
scaleRatio, samplingRate, isTrain);
hl_conv_random_disturb_with_params(
images, imgSize, tgtSize, channels, numImages,
samplingRate, gpu_r_angle, gpu_s_ratio,
gpu_center_r, gpu_center_r, paddingValue,
targets);
hl_generate_disturb_params(gpu_r_angle,
gpu_s_ratio,
gpu_center_r,
gpu_center_c,
numImages,
imgSize,
rotateAngle,
scaleRatio,
samplingRate,
isTrain);
hl_conv_random_disturb_with_params(images,
imgSize,
tgtSize,
channels,
numImages,
samplingRate,
gpu_r_angle,
gpu_s_ratio,
gpu_center_r,
gpu_center_r,
paddingValue,
targets);
}
......@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_cuda.h"
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"
template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output, int ldo,
real* table, int ldt,
template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
int ldo,
real* table,
int ldt,
int* ids,
int numSamples,
int tableSize,
......@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
while (idy < numSamples) {
int tableId = ids[idy];
if ((0 <= tableId) && (tableId < tableSize)) {
real *out = output + idy * ldo;
real *tab = table + tableId * ldt;
real* out = output + idy * ldo;
real* tab = table + tableId * ldt;
for (int i = idx; i < dim; i += blockDimX) {
if (AddRow) {
paddle::paddleAtomicAdd(&tab[i], out[i]);
......@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
}
}
void hl_matrix_select_rows(real* output, int ldo,
real* table, int ldt,
void hl_matrix_select_rows(real* output,
int ldo,
real* table,
int ldt,
int* ids,
int numSamples,
int tableSize,
......@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
dim3 threads(128, 8);
dim3 grid(8, 1);
KeMatrixAddRows<128, 8, 8, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
(output, ldo, table, ldt, ids, numSamples, tableSize, dim);
KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
output, ldo, table, ldt, ids, numSamples, tableSize, dim);
CHECK_SYNC("hl_matrix_select_rows failed");
}
void hl_matrix_add_to_rows(real* table, int ldt,
real* input, int ldi,
void hl_matrix_add_to_rows(real* table,
int ldt,
real* input,
int ldi,
int* ids,
int numSamples,
int tableSize,
......@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
dim3 threads(128, 8);
dim3 grid(8, 1);
KeMatrixAddRows<128, 8, 8, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
(input, ldi, table, ldt, ids, numSamples, tableSize, dim);
KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
input, ldi, table, ldt, ids, numSamples, tableSize, dim);
CHECK_SYNC("hl_matrix_add_to_rows failed");
}
template<class T, int blockDimX, int gridDimX>
__global__ void KeVectorSelect(T* dst, int sized,
const T* src, int sizes,
const int* ids, int sizei) {
template <class T, int blockDimX, int gridDimX>
__global__ void KeVectorSelect(
T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
int idx = threadIdx.x + blockDimX * blockIdx.x;
while (idx < sizei) {
int index = ids[idx];
......@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
}
template <class T>
void hl_vector_select_from(T* dst, int sized,
const T* src, int sizes,
const int* ids, int sizei) {
void hl_vector_select_from(
T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
CHECK_NOTNULL(dst);
CHECK_NOTNULL(src);
CHECK_NOTNULL(ids);
......@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
dim3 threads(512, 1);
dim3 grid(8, 1);
KeVectorSelect<T, 512, 8><<< grid, threads, 0, STREAM_DEFAULT >>>
(dst, sized, src, sizes, ids, sizei);
KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
dst, sized, src, sizes, ids, sizei);
CHECK_SYNC("hl_vector_select_from failed");
}
template
void hl_vector_select_from(real* dst, int sized,
const real* src, int sizes,
const int* ids, int sizei);
template
void hl_vector_select_from(int* dst, int sized,
const int* src, int sizes,
const int* ids, int sizei);
template void hl_vector_select_from(real* dst,
int sized,
const real* src,
int sizes,
const int* ids,
int sizei);
template void hl_vector_select_from(
int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);
......@@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "hl_top_k.h"
#include "hl_sparse.ph"
#include "hl_top_k.h"
#include "paddle/utils/Logging.h"
// using namespace hppl;
struct Pair {
__device__ __forceinline__
Pair() {}
__device__ __forceinline__ Pair() {}
__device__ __forceinline__
Pair(real value, int id) : v_(value), id_(id) {}
__device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}
__device__ __forceinline__
void set(real value, int id) {
__device__ __forceinline__ void set(real value, int id) {
v_ = value;
id_ = id;
}
__device__ __forceinline__
void operator=(const Pair& in) {
__device__ __forceinline__ void operator=(const Pair& in) {
v_ = in.v_;
id_ = in.id_;
}
__device__ __forceinline__
bool operator<(const real value) const {
__device__ __forceinline__ bool operator<(const real value) const {
return (v_ < value);
}
__device__ __forceinline__
bool operator<(const Pair& in) const {
__device__ __forceinline__ bool operator<(const Pair& in) const {
return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
}
__device__ __forceinline__
bool operator>(const Pair& in) const {
__device__ __forceinline__ bool operator>(const Pair& in) const {
return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
}
......@@ -58,8 +50,9 @@ struct Pair {
int id_;
};
__device__ __forceinline__
void addTo(Pair topK[], const Pair &p, int beamSize) {
__device__ __forceinline__ void addTo(Pair topK[],
const Pair& p,
int beamSize) {
for (int k = beamSize - 2; k >= 0; k--) {
if (topK[k] < p) {
topK[k + 1] = topK[k];
......@@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) {
topK[0] = p;
}
template<int beamSize>
__device__ __forceinline__
void addTo(Pair topK[], const Pair &p) {
template <int beamSize>
__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) {
for (int k = beamSize - 2; k >= 0; k--) {
if (topK[k] < p) {
topK[k + 1] = topK[k];
......@@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) {
topK[0] = p;
}
template<int blockSize>
__device__ __forceinline__
void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
template <int blockSize>
__device__ __forceinline__ void getTopK(
Pair topK[], real* src, int idx, int dim, int beamSize) {
while (idx < dim) {
if (topK[beamSize - 1] < src[idx]) {
Pair tmp(src[idx], idx);
......@@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
}
}
template<int blockSize>
__device__ __forceinline__
void getTopK(Pair topK[], real *src, int idx, int dim,
const Pair& max, int beamSize) {
template <int blockSize>
__device__ __forceinline__ void getTopK(
Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) {
while (idx < dim) {
if (topK[beamSize - 1] < src[idx]) {
Pair tmp(src[idx], idx);
......@@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim,
}
}
template<int blockSize>
__device__ __forceinline__
void getTopK(Pair topK[], real *val, int *col,
int idx, int dim, int beamSize) {
template <int blockSize>
__device__ __forceinline__ void getTopK(
Pair topK[], real* val, int* col, int idx, int dim, int beamSize) {
while (idx < dim) {
if (topK[beamSize - 1] < val[idx]) {
Pair tmp(val[idx], col[idx]);
......@@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col,
}
}
template<int blockSize>
__device__ __forceinline__
void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
const Pair& max, int beamSize) {
template <int blockSize>
__device__ __forceinline__ void getTopK(Pair topK[],
real* val,
int* col,
int idx,
int dim,
const Pair& max,
int beamSize) {
while (idx < dim) {
if (topK[beamSize - 1] < val[idx]) {
Pair tmp(val[idx], col[idx]);
......@@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
}
}
template<int maxLength, int blockSize>
__device__ __forceinline__
void threadGetTopK(Pair topK[], int& beam, int beamSize,
real* src,
bool& firstStep, bool& isEmpty, Pair& max,
int dim, const int tid) {
template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
int& beam,
int beamSize,
real* src,
bool& firstStep,
bool& isEmpty,
Pair& max,
int dim,
const int tid) {
if (beam > 0) {
int length = beam < beamSize ? beam : beamSize;
if (firstStep) {
......@@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
}
}
if (!isEmpty) {
getTopK<blockSize>(topK + maxLength - beam, src, tid, dim,
max, length);
getTopK<blockSize>(topK + maxLength - beam, src, tid, dim, max, length);
}
}
......@@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
}
}
template<int maxLength, int blockSize>
__device__ __forceinline__
void threadGetTopK(Pair topK[], int& beam, int beamSize,
real* val, int* col,
bool& firstStep, bool& isEmpty, Pair& max,
int dim, const int tid) {
template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
int& beam,
int beamSize,
real* val,
int* col,
bool& firstStep,
bool& isEmpty,
Pair& max,
int dim,
const int tid) {
if (beam > 0) {
int length = beam < beamSize ? beam : beamSize;
if (firstStep) {
......@@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
}
}
if (!isEmpty) {
getTopK<blockSize>(topK + maxLength - beam, val, col, tid, dim,
max, length);
getTopK<blockSize>(
topK + maxLength - beam, val, col, tid, dim, max, length);
}
}
......@@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
}
}
template<int maxLength, int blockSize>
__device__ __forceinline__
void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
real** topVal, int** topIds,
int& beam, int& beamSize,
const int tid, const int warp) {
template <int maxLength, int blockSize>
__device__ __forceinline__ void blockReduce(Pair* shTopK,
int* maxId,
Pair topK[],
real** topVal,
int** topIds,
int& beam,
int& beamSize,
const int tid,
const int warp) {
while (true) {
__syncthreads();
if (tid < blockSize / 2) {
......@@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
}
}
__syncthreads();
for (int stride = blockSize / 4; stride > 0; stride = stride/2) {
for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
if (tid < stride) {
if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
maxId[tid] = maxId[tid + stride];
......@@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
* 3. go to the second setp, until one thread's topK value is null;
* 4. go to the first setp, until get the topK value.
*/
template<int maxLength, int blockSize>
__global__ void KeMatrixTopK(real* topVal, int ldv,
int * topIds,
real* src, int lds,
template <int maxLength, int blockSize>
__global__ void KeMatrixTopK(real* topVal,
int ldv,
int* topIds,
real* src,
int lds,
int dim,
int beamSize) {
__shared__ Pair shTopK[blockSize];
......@@ -271,7 +279,7 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
topVal += blockIdx.x * ldv;
topIds += blockIdx.x * beamSize;
Pair topK[maxLength]; // NOLINT
Pair topK[maxLength]; // NOLINT
int beam = maxLength;
Pair max;
bool isEmpty = false;
......@@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
topK[k].set(-HL_FLOAT_MAX, -1);
}
while (beamSize) {
threadGetTopK<maxLength, blockSize>
(topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
threadGetTopK<maxLength, blockSize>(
topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
shTopK[tid] = topK[0];
blockReduce<maxLength, blockSize>
(shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
blockReduce<maxLength, blockSize>(
shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
}
}
template<int maxLength, int blockSize>
__global__ void KeSMatrixTopK(real* topVal, int ldv,
int * topIds,
template <int maxLength, int blockSize>
__global__ void KeSMatrixTopK(real* topVal,
int ldv,
int* topIds,
real* val,
int* row,
int* col,
......@@ -304,7 +313,7 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
topVal += blockIdx.x * ldv;
topIds += blockIdx.x * beamSize;
Pair topK[maxLength]; // NOLINT
Pair topK[maxLength]; // NOLINT
int beam = maxLength;
Pair max;
bool isEmpty = false;
......@@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
topK[k].set(-HL_FLOAT_MAX, -1);
}
while (beamSize) {
threadGetTopK<maxLength, blockSize>
(topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);
threadGetTopK<maxLength, blockSize>(
topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);
shTopK[tid] = topK[0];
blockReduce<maxLength, blockSize>
(shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
blockReduce<maxLength, blockSize>(
shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
}
}
void hl_matrix_top_k(real* topVal, int ldv,
int * topIds,
real* src, int lds,
void hl_matrix_top_k(real* topVal,
int ldv,
int* topIds,
real* src,
int lds,
int dim,
int beamSize,
int numSamples) {
......@@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv,
dim3 threads(256, 1);
dim3 grid(numSamples, 1);
KeMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>>
(topVal, ldv, topIds, src, lds, dim, beamSize);
KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
topVal, ldv, topIds, src, lds, dim, beamSize);
CHECK_SYNC("hl_matrix_top_k failed");
}
void hl_sparse_matrix_top_k(real* topVal, int ldv,
int * topIds,
void hl_sparse_matrix_top_k(real* topVal,
int ldv,
int* topIds,
hl_sparse_matrix_s src,
int beamSize,
int numSamples) {
CHECK_NOTNULL(topVal);
CHECK_NOTNULL(topIds);
CHECK_NOTNULL(src);
CHECK_EQ(src->format, HL_SPARSE_CSR)
<<"sparse matrix format error!";
CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";
hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
if (csr->csr_val == NULL || csr->csr_row == NULL ||
csr->csr_col == NULL) {
if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
LOG(FATAL) << "parameter src is null!";
}
dim3 threads(256, 1);
dim3 grid(numSamples, 1);
KeSMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>>
(topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);
KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);
CHECK_SYNC("hl_sparse_matrix_top_k failed");
}
......@@ -392,10 +402,12 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
* 3. go to the second setp, until one thread's topK value is null;
* 4. go to the first setp, until get the topK value.
*/
template<int maxLength, int blockSize>
__global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
int * topIds,
real* src, int lds,
template <int maxLength, int blockSize>
__global__ void KeMatrixTopKClassificationError(real* topVal,
int ldv,
int* topIds,
real* src,
int lds,
int dim,
int beamSize,
int* label,
......@@ -408,7 +420,7 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
topVal += blockIdx.x * ldv;
topIds += blockIdx.x * beamSize;
Pair topK[maxLength]; // NOLINT
Pair topK[maxLength]; // NOLINT
int beam = maxLength;
Pair max;
bool isEmpty = false;
......@@ -420,34 +432,36 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
}
while (beamSize) {
threadGetTopK<maxLength, blockSize>
(topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
threadGetTopK<maxLength, blockSize>(
topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
shTopK[tid] = topK[0];
blockReduce<maxLength, blockSize>
(shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
blockReduce<maxLength, blockSize>(
shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
}
__syncthreads();
if (tid == 0) {
for (int i = 0; i < topkSize; i++) {
if (*--topIds == label[blockIdx.x]) {
recResult[blockIdx.x] = 0;
break;
}
recResult[blockIdx.x] = 1.0f;
if (*--topIds == label[blockIdx.x]) {
recResult[blockIdx.x] = 0;
break;
}
recResult[blockIdx.x] = 1.0f;
}
}
}
void hl_matrix_classification_error(real* topVal, int ldv,
int* topIds,
real* src, int lds,
int dim,
int topkSize,
int numSamples,
int* label,
real* recResult) {
void hl_matrix_classification_error(real* topVal,
int ldv,
int* topIds,
real* src,
int lds,
int dim,
int topkSize,
int numSamples,
int* label,
real* recResult) {
CHECK_NOTNULL(topVal);
CHECK_NOTNULL(topIds);
CHECK_NOTNULL(src);
......@@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv,
dim3 threads(256, 1);
dim3 grid(numSamples, 1);
KeMatrixTopKClassificationError<5, 256>
<<< grid, threads, 0, STREAM_DEFAULT >>>
(topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
KeMatrixTopKClassificationError<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
CHECK_SYNC("hl_matrix_top_k classification error failed");
}
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
......@@ -25,18 +25,15 @@ limitations under the License. */
namespace paddle {
namespace framework {
namespace {
typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
Dim<8>, Dim<9>>
DDimVar;
}
/**
* \brief A dynamically sized dimension.
*
* The number of dimensions must be between [1, 9].
*/
struct DDim {
typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
Dim<8>, Dim<9>>
DDimVar;
DDimVar var;
DDim() : var(Dim<1>()) {}
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册