Commit 1d4fa243 authored by liaogang

ClangFormat for proto and cuda

Parent 6512893b
@@ -24,7 +24,7 @@
     description: Format files with ClangFormat.
     entry: clang-format -i
     language: system
-    files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
+    files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 - repo: https://github.com/PaddlePaddle/pre-commit-golang
   sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
   hooks:
...
@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_batch_transpose.h"
 #include "hl_base.h"
+#include "hl_batch_transpose.h"

 const int TILE_DIM = 64;
 const int BLOCK_ROWS = 16;

 // No bank-conflict transpose for a batch of data.
-__global__ void batchTransposeNoBankConflicts(real* odata,
-                                              const real* idata,
-                                              int numSamples, int width,
-                                              int height) {
+__global__ void batchTransposeNoBankConflicts(
+    real* odata, const real* idata, int numSamples, int width, int height) {
   __shared__ float tile[TILE_DIM][TILE_DIM + 1];

   const int x = blockIdx.x * TILE_DIM + threadIdx.x;
@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
                newX] = tile[threadIdx.x][j];
 }

-void batchTranspose(const real* input, real* output, int width, int height,
-                    int batchSize) {
+void batchTranspose(
+    const real* input, real* output, int width, int height, int batchSize) {
   dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
   dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
-  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
-      (output, input, batchSize, width, height);
+  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+      output, input, batchSize, width, height);
   CHECK_SYNC("batchTranspose failed!");
 }
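The `TILE_DIM + 1` padding in the kernel above is the standard trick for avoiding shared-memory bank conflicts during a transpose. A minimal standalone sketch of the same idea (the names `TILE` and `demoTranspose` are illustrative, not from this commit):

```cuda
// Sketch only: why `tile[TILE_DIM][TILE_DIM + 1]` avoids bank conflicts.
#include <cuda_runtime.h>

const int TILE = 32;  // one float per shared-memory bank

__global__ void demoTranspose(float* out, const float* in, int n) {
  // The extra column staggers addresses so the column-wise reads below
  // (tile[threadIdx.x][...]) fall into distinct banks within a warp.
  __shared__ float tile[TILE][TILE + 1];
  int x = blockIdx.x * TILE + threadIdx.x;
  int y = blockIdx.y * TILE + threadIdx.y;
  if (x < n && y < n) tile[threadIdx.y][threadIdx.x] = in[y * n + x];
  __syncthreads();
  int tx = blockIdx.y * TILE + threadIdx.x;  // transposed block origin
  int ty = blockIdx.x * TILE + threadIdx.y;
  if (tx < n && ty < n) out[ty * n + tx] = tile[threadIdx.x][threadIdx.y];
}
```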
@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "hl_aggregate.h"
 #include "hl_base.h"
 #include "hl_cuda.h"
 #include "hl_cuda.ph"
-#include "hl_aggregate.h"
-#include "hl_thread.ph"
 #include "hl_matrix_base.cuh"
+#include "hl_thread.ph"
 #include "paddle/utils/Logging.h"

 /**
  * @brief matrix row operator.
  */
-template<class Agg, int blockSize>
-__global__ void KeMatrixRowOp(Agg agg,
-                              real *E,
-                              real *Sum,
-                              int dimN) {
+template <class Agg, int blockSize>
+__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
   __shared__ real sum_s[blockSize];
-  int cnt = (dimN + blockSize -1) / blockSize;
-  int rowId = blockIdx.x + blockIdx.y*gridDim.x;
-  int index = rowId*dimN;
+  int cnt = (dimN + blockSize - 1) / blockSize;
+  int rowId = blockIdx.x + blockIdx.y * gridDim.x;
+  int index = rowId * dimN;
   int tid = threadIdx.x;
   int lmt = tid;
@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg,
   sum_s[tid] = tmp;
   __syncthreads();
-  for (int stride = blockSize/2; stride > 0; stride = stride/2) {
+  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
     }
@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg,
 }

 template <class Agg>
-void hl_matrix_row_op(Agg agg,
-                      real *A_d,
-                      real *C_d,
-                      int dimM,
-                      int dimN) {
+void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
   int blocksX = dimM;
   int blocksY = 1;
   dim3 threads(128, 1);
   dim3 grid(blocksX, blocksY);
-  KeMatrixRowOp<Agg, 128><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (agg, A_d, C_d, dimN);
+  KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      agg, A_d, C_d, dimN);
 }
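`KeMatrixRowOp` reduces one matrix row per block: each thread accumulates a strided slice, then the `stride = blockSize / 2` loop folds the shared array in halves. That folding step in isolation (a sketch with a hypothetical helper name; `blockSize` must be a power of two):

```cuda
// Sketch: the shared-memory tree reduction used by KeMatrixRowOp above.
template <int blockSize>
__device__ float blockReduceSum(float v) {
  __shared__ float s[blockSize];
  int tid = threadIdx.x;
  s[tid] = v;
  __syncthreads();
  // Each step halves the active range; s[0] ends up holding the block sum.
  for (int stride = blockSize / 2; stride > 0; stride /= 2) {
    if (tid < stride) s[tid] += s[tid + stride];
    __syncthreads();
  }
  return s[0];
}
```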
 void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

-  hl_matrix_row_op(aggregate::sum(),
-                   A_d,
-                   C_d,
-                   dimM,
-                   dimN);
+  hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_sum failed");
 }
@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

-  hl_matrix_row_op(aggregate::max(),
-                   A_d,
-                   C_d,
-                   dimM,
-                   dimN);
+  hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_max failed");
 }
@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

-  hl_matrix_row_op(aggregate::min(),
-                   A_d,
-                   C_d,
-                   dimM,
-                   dimN);
+  hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_min failed");
 }

 /**
  * @brief matrix column operator.
  */
-template<class Agg>
-__global__ void KeMatrixColumnOp(Agg agg,
-                                 real *E,
-                                 real *Sum,
-                                 int dimM,
-                                 int dimN) {
+template <class Agg>
+__global__ void KeMatrixColumnOp(
+    Agg agg, real *E, real *Sum, int dimM, int dimN) {
   int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   real tmp = agg.init();
   if (rowIdx < dimN) {
@@ -127,13 +104,10 @@ __global__ void KeMatrixColumnOp(Agg agg,
   }
 }

-template<class Agg, int blockDimX, int blockDimY>
-__global__ void KeMatrixColumnOp_S(Agg agg,
-                                   real *E,
-                                   real *Sum,
-                                   int dimM,
-                                   int dimN) {
-  __shared__ real _sum[blockDimX*blockDimY];
+template <class Agg, int blockDimX, int blockDimY>
+__global__ void KeMatrixColumnOp_S(
+    Agg agg, real *E, real *Sum, int dimM, int dimN) {
+  __shared__ real _sum[blockDimX * blockDimY];
   int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   int index = threadIdx.y;
@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
       index += blockDimY;
     }
   }
-  _sum[threadIdx.x + threadIdx.y*blockDimX] = tmp;
+  _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
   __syncthreads();

   if (rowIdx < dimN) {
-    if (threadIdx.y ==0) {
+    if (threadIdx.y == 0) {
       real tmp = agg.init();
-      for (int i=0; i < blockDimY; i++) {
-        tmp = agg(tmp, _sum[threadIdx.x + i*blockDimX]);
+      for (int i = 0; i < blockDimY; i++) {
+        tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
       }
       Sum[rowIdx] = tmp;
     }
@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
 }

 template <class Agg>
-void hl_matrix_column_op(Agg agg,
-                         real *A_d,
-                         real *C_d,
-                         int dimM,
-                         int dimN) {
+void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
   if (dimN >= 8192) {
-    int blocksX = (dimN + 128 -1) / 128;
+    int blocksX = (dimN + 128 - 1) / 128;
     int blocksY = 1;
     dim3 threads(128, 1);
     dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp<Agg><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (agg, A_d, C_d, dimM, dimN);
+    KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        agg, A_d, C_d, dimM, dimN);
   } else {
-    int blocksX = (dimN + 32 -1) / 32;
+    int blocksX = (dimN + 32 - 1) / 32;
     int blocksY = 1;
     dim3 threads(32, 32);
     dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp_S<Agg, 32, 32><<< grid, threads, 0, STREAM_DEFAULT>>>
-        (agg, A_d, C_d, dimM, dimN);
+    KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        agg, A_d, C_d, dimM, dimN);
   }
   return;
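Grid sizes here and throughout the commit use ceiling division, `(n + block - 1) / block`, so the last partially filled block is still launched and its out-of-range threads are masked by the index guards. A sketch, assuming `DIVUP` (used by `batchTranspose` above) is defined this way:

```cuda
// Sketch of the ceiling-division grid sizing used above.
#define DIVUP(x, b) (((x) + (b) - 1) / (b))
// Example: dimN == 8193 with 128-thread blocks ->
//   (8193 + 127) / 128 == 65 blocks; block 64 covers the final column,
//   and its remaining 127 threads fail the `rowIdx < dimN` guard.
```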
@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

-  hl_matrix_column_op(aggregate::sum(),
-                      A_d,
-                      C_d,
-                      dimM,
-                      dimN);
+  hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);

   CHECK_SYNC("hl_matrix_column_sum failed");
 }
@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

-  hl_matrix_column_op(aggregate::max(),
-                      A_d,
-                      C_d,
-                      dimM,
-                      dimN);
+  hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);

   CHECK_SYNC("hl_matrix_column_max failed");
 }
@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

-  hl_matrix_column_op(aggregate::min(),
-                      A_d,
-                      C_d,
-                      dimM,
-                      dimN);
+  hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);

   CHECK_SYNC("hl_matrix_column_min failed");
 }
@@ -226,16 +184,16 @@ template <int blockSize>
 __global__ void KeVectorSum(real *E, real *Sum, int dimM) {
   __shared__ double sum_s[blockSize];
   int tid = threadIdx.x;
-  int index = blockIdx.y*blockDim.x+threadIdx.x;
+  int index = blockIdx.y * blockDim.x + threadIdx.x;

   sum_s[tid] = 0.0f;
   while (index < dimM) {
     sum_s[tid] += E[index];
-    index += blockDim.x*gridDim.y;
+    index += blockDim.x * gridDim.y;
   }
   __syncthreads();

-  for (int stride = blockSize/2; stride > 0; stride = stride/2) {
+  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] += sum_s[tid + stride];
     }
@@ -261,36 +219,37 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
   struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
   hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }

-  KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (A_d, t_resource.gpu_mem, dimM);
-  KeVectorSum<128><<< 1, threads, 0, STREAM_DEFAULT >>>
-      (t_resource.gpu_mem, t_resource.cpu_mem, 128);
+  KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      A_d, t_resource.gpu_mem, dimM);
+  KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
+      t_resource.gpu_mem, t_resource.cpu_mem, 128);

   hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
   hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);

   hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   cudaError_t err = (cudaError_t)hl_get_device_last_error();
-  CHECK_EQ(cudaSuccess, err)
-      << "CUDA error: " << hl_get_device_error_string((size_t)err);
+  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
+                             << hl_get_device_error_string((size_t)err);
 }
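`hl_vector_sum` reduces in two launches: the first writes one partial sum per block into scratch GPU memory (`t_resource.gpu_mem` in the source), the second folds those 128 partials with a single block. A self-contained sketch of the same pattern, with hypothetical names rather than the Paddle API:

```cuda
// Sketch: two-pass sum, mirroring hl_vector_sum above.
#include <cuda_runtime.h>

template <int BS>
__global__ void sumKernel(const float* in, float* out, int n) {
  __shared__ float s[BS];
  int tid = threadIdx.x;
  float v = 0.f;
  // Grid-stride accumulation, like `index += blockDim.x * gridDim.y` above.
  for (int i = blockIdx.x * BS + tid; i < n; i += BS * gridDim.x) v += in[i];
  s[tid] = v;
  __syncthreads();
  for (int stride = BS / 2; stride > 0; stride /= 2) {
    if (tid < stride) s[tid] += s[tid + stride];
    __syncthreads();
  }
  if (tid == 0) out[blockIdx.x] = s[0];
}

// Pass 1: one partial per block. Pass 2: fold the partials with one block.
//   sumKernel<128><<<128, 128>>>(d_in, d_partial, n);
//   sumKernel<128><<<1, 128>>>(d_partial, d_out, 128);
```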
 template <int blockSize>
 __global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
   __shared__ double sum_s[blockSize];
   int tid = threadIdx.x;
-  int index = blockIdx.y*blockDim.x+threadIdx.x;
+  int index = blockIdx.y * blockDim.x + threadIdx.x;

   sum_s[tid] = 0.0f;
   while (index < dimM) {
     sum_s[tid] += abs(E[index]);
-    index += blockDim.x*gridDim.y;
+    index += blockDim.x * gridDim.y;
   }
   __syncthreads();

-  for (int stride = blockSize/2; stride > 0; stride = stride/2) {
+  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] += sum_s[tid + stride];
     }
@@ -316,18 +275,19 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
   struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
   hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }

-  KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (A_d, t_resource.gpu_mem, dimM);
-  KeVectorAbsSum<128><<< 1, threads, 0, STREAM_DEFAULT >>>
-      (t_resource.gpu_mem, t_resource.cpu_mem, 128);
+  KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      A_d, t_resource.gpu_mem, dimM);
+  KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
+      t_resource.gpu_mem, t_resource.cpu_mem, 128);

   hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
   hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);

   hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   cudaError_t err = (cudaError_t)hl_get_device_last_error();
-  CHECK_EQ(cudaSuccess, err)
-      << "CUDA error: " << hl_get_device_error_string((size_t)err);
+  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
+                             << hl_get_device_error_string((size_t)err);
 }
This diff is collapsed.
This diff is collapsed.
@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

 #include "hl_base.h"
+#include "hl_device_functions.cuh"
+#include "hl_gpu_matrix_kernel.cuh"
 #include "hl_matrix.h"
-#include "hl_matrix_ops.cuh"
 #include "hl_matrix_apply.cuh"
+#include "hl_matrix_ops.cuh"
 #include "hl_sequence.h"
 #include "hl_sparse.ph"
 #include "paddle/utils/Logging.h"
-#include "hl_device_functions.cuh"
-#include "hl_gpu_matrix_kernel.cuh"

 DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b);
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);

-void hl_matrix_add(real *A_d,
-                   real *B_d,
-                   real *C_d,
+void hl_matrix_add(real* A_d,
+                   real* B_d,
+                   real* C_d,
                    int dimM,
                    int dimN,
                    real alpha,
@@ -36,8 +35,8 @@ void hl_matrix_add(real *A_d,
   CHECK_NOTNULL(B_d);
   CHECK_NOTNULL(C_d);

-  hl_gpu_apply_ternary_op
-      <real, ternary::_add<real>, 0, 0>(ternary::_add<real>(alpha, beta),
+  hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
+      ternary::_add<real>(alpha, beta),
       A_d,
       B_d,
       C_d,
@@ -50,12 +49,11 @@ void hl_matrix_add(real *A_d,
 }

 #ifdef PADDLE_TYPE_DOUBLE
 #define THRESHOLD 128
 #else
 #define THRESHOLD 64
 #endif
-__device__ __forceinline__
-void findMax(real* I,
+__device__ __forceinline__ void findMax(real* I,
                                         real* dfMax_s,
                                         int blockSize,
                                         int base,
@@ -89,8 +87,7 @@ void findMax(real* I,
   __syncthreads();
 }

-__device__ __forceinline__
-void subMaxAndExp(real* I,
+__device__ __forceinline__ void subMaxAndExp(real* I,
                                              real* O,
                                              int curIdx,
                                              int nextIdx,
@@ -115,8 +112,7 @@ void subMaxAndExp(real* I,
   __syncthreads();
 }

-__device__ __forceinline__
-void valueSum(real* O,
+__device__ __forceinline__ void valueSum(real* O,
                                          real* dfMax_s,
                                          int blockSize,
                                          int base,
@@ -141,13 +137,8 @@ void valueSum(real* O,
   __syncthreads();
 }

-__device__ __forceinline__
-void divSum(real* O,
-            real sum,
-            int curIdx,
-            int nextIdx,
-            int blockSize,
-            int dimN) {
+__device__ __forceinline__ void divSum(
+    real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
   while (curIdx < dimN) {
     O[nextIdx] /= sum;
     nextIdx += blockSize;
@@ -155,8 +146,7 @@ void divSum(real* O,
   }
 }

-__device__ __forceinline__
-void softmax(real* I,
+__device__ __forceinline__ void softmax(real* I,
                                         real* O,
                                         real* dfMax_s,
                                         int blockSize,
@@ -167,8 +157,7 @@ void softmax(real* I,
   __shared__ real max;

   // find the max number
-  findMax(I, dfMax_s, blockSize, base, curIdx,
-          nextIdx, dimN, &max);
+  findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);

   // sub max Value and do Exp operation
   subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
@@ -181,8 +170,8 @@ void softmax(real* I,
   divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
 }

-template<int blockSize>
-__global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
+template <int blockSize>
+__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
   int base = threadIdx.x;
   __shared__ real dfMax_s[blockSize];
   int nextIdx = blockIdx.x * dimN + base;
@@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
   softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
 }

-void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) {
+void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

   dim3 block(512, 1);
   dim3 grid(dimM, 1);
-  KeMatrixSoftMax<512>
-      <<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
+  KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
   CHECK_SYNC("hl_matrix_softmax failed");
 }
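The device helpers above implement the numerically stable softmax: subtract the row maximum before exponentiating so `exp` cannot overflow. The same four steps written serially for clarity (a hypothetical host-side reference, not part of the commit):

```cuda
// Sketch: what findMax / subMaxAndExp / valueSum / divSum compute per row.
#include <math.h>

void softmaxRef(const float* in, float* out, int n) {
  float m = in[0];
  for (int i = 1; i < n; i++) m = fmaxf(m, in[i]);  // findMax
  float sum = 0.f;
  for (int i = 0; i < n; i++) {  // subMaxAndExp, then valueSum
    out[i] = expf(in[i] - m);
    sum += out[i];
  }
  for (int i = 0; i < n; i++) out[i] /= sum;  // divSum
}
```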
-template<int blockSize>
-__global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
+template <int blockSize>
+__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
   int base = threadIdx.x;
   int bid = blockIdx.x;
   __shared__ real dfMax_s[blockSize];
@@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
   softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
 }

-void hl_sequence_softmax_forward(real *A_d,
-                                 real *C_d,
+void hl_sequence_softmax_forward(real* A_d,
+                                 real* C_d,
                                  const int* index,
                                  int numSequence) {
   CHECK_NOTNULL(A_d);
@@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d,
   dim3 block(512, 1);
   dim3 grid(numSequence, 1);
-  KeSequenceSoftMax<512>
-      <<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
+  KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
   CHECK_SYNC("hl_sequence_softmax_forward failed");
 }

-__global__ void KeMatrixDerivative(real *grad_d,
-                                   real *output_d,
-                                   real *sftmaxSum_d,
-                                   int dimM,
-                                   int dimN) {
-  int rowIdx = blockIdx.x*blockDim.x + threadIdx.x;
-  int colIdx = blockIdx.y*blockDim.y + threadIdx.y;
+__global__ void KeMatrixDerivative(
+    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
+  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
   int index;

   if (rowIdx < dimM && colIdx < dimN) {
-    index = rowIdx*dimN + colIdx;
+    index = rowIdx * dimN + colIdx;
     grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
   }
 }

-void hl_matrix_softmax_derivative(real *grad_d,
-                                  real *output_d,
-                                  real *sftmaxSum_d,
-                                  int dimM,
-                                  int dimN) {
+void hl_matrix_softmax_derivative(
+    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
   CHECK_NOTNULL(grad_d);
   CHECK_NOTNULL(output_d);
   CHECK_NOTNULL(sftmaxSum_d);

   int blocksX = (dimM + 0) / 1;
-  int blocksY = (dimN + 1024 -1) / 1024;
+  int blocksY = (dimN + 1024 - 1) / 1024;
   dim3 threads(1, 1024);
   dim3 grid(blocksX, blocksY);

-  KeMatrixDerivative<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (grad_d, output_d, sftmaxSum_d, dimM, dimN);
+  KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_d, output_d, sftmaxSum_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_softmax_derivative failed");
 }

-__global__ void KeMatrixMultiBinaryCrossEntropy(real* output,
-                                                real* entropy,
-                                                int* row,
-                                                int* col,
-                                                int dimM,
-                                                int dimN) {
+__global__ void KeMatrixMultiBinaryCrossEntropy(
+    real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < dimM) {
-    for (int i = 0; i < dimN; i ++) {
+    for (int i = 0; i < dimN; i++) {
       entropy[index] -= log(1 - output[index * dimN + i]);
     }
-    int *row_col = col + row[index];
+    int* row_col = col + row[index];
     int col_num = row[index + 1] - row[index];
-    for (int i = 0; i < col_num; i ++) {
+    for (int i = 0; i < col_num; i++) {
       real o = output[index * dimN + row_col[i]];
       entropy[index] -= log(o / (1 - o));
     }
@@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output,
   dim3 threads(n_threads);
   dim3 grid(blocks);
   hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
-  KeMatrixMultiBinaryCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
+  KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
   CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
 }
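For row $i$ with positive label set $P_i$ stored in CSR form (`row`/`col`), the kernel's two loops are an algebraic rearrangement of the usual multi-label binary cross entropy; expanding the second sum shows they agree:

$$
\text{entropy}_i
  = -\sum_{j} \log(1 - o_{ij}) \;-\; \sum_{j \in P_i} \log\frac{o_{ij}}{1 - o_{ij}}
  = -\sum_{j \in P_i} \log o_{ij} \;-\; \sum_{j \notin P_i} \log(1 - o_{ij}).
$$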
-__global__ void KeMatrixMultiBinaryCrossEntropyBp(real* output,
-                                                  real* grad,
-                                                  int* row,
-                                                  int* col,
-                                                  int dimM,
-                                                  int dimN) {
+__global__ void KeMatrixMultiBinaryCrossEntropyBp(
+    real* output, real* grad, int* row, int* col, int dimM, int dimN) {
   int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (row_idx < dimM) {
-    for (int i = 0; i < dimN; i ++) {
+    for (int i = 0; i < dimN; i++) {
       int index = row_idx * dimN + i;
       grad[index] += 1.0 / (1 - output[index]);
     }
     int col_num = row[row_idx + 1] - row[row_idx];
-    int *row_col = col + row[row_idx];
-    for (int i = 0; i < col_num; i ++) {
+    int* row_col = col + row[row_idx];
+    for (int i = 0; i < col_num; i++) {
       int index = row_idx * dimN + row_col[i];
       grad[index] -= 1.0 / (output[index] * (1 - output[index]));
     }
   }
 }

-void hl_matrix_multi_binary_cross_entropy_bp(real* output,
-                                             real* grad,
-                                             hl_sparse_matrix_s csr_mat,
-                                             int dimM,
-                                             int dimN) {
+void hl_matrix_multi_binary_cross_entropy_bp(
+    real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
   CHECK_NOTNULL(output);
   CHECK_NOTNULL(grad);
   CHECK_NOTNULL(csr_mat);
@@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output,
   dim3 threads(n_threads);
   dim3 grid(blocks);
   hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
-  KeMatrixMultiBinaryCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
+  KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
   CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
 }

-__global__ void KeMatrixCrossEntropy(real* O,
-                                     real* E,
-                                     int* label,
-                                     int dimM,
-                                     int dimN) {
+__global__ void KeMatrixCrossEntropy(
+    real* O, real* E, int* label, int dimM, int dimN) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int newBase;
   if (index < dimM) {
@@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O,
   }
 }

-void hl_matrix_cross_entropy(real* A_d,
-                             real* C_d,
-                             int* label_d,
-                             int dimM,
-                             int dimN) {
+void hl_matrix_cross_entropy(
+    real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

   int blocks = (dimM + 1024 - 1) / 1024;
   dim3 threads(1024, 1);
   dim3 grid(blocks, 1);
-  KeMatrixCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (A_d, C_d, label_d, dimM, dimN);
+  KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      A_d, C_d, label_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_cross_entropy failed");
 }

-__global__ void KeMatrixCrossEntropyBp(real* grad_d,
-                                       real* output_d,
-                                       int* label_d,
-                                       int dimM,
-                                       int dimN) {
-  int rowIdx = blockIdx.x*blockDim.x + threadIdx.x;
-  int colIdx = blockIdx.y*blockDim.y + threadIdx.y;
+__global__ void KeMatrixCrossEntropyBp(
+    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
+  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
   int index;

   if (rowIdx < dimM && colIdx < dimN) {
-    index = rowIdx*dimN + colIdx;
+    index = rowIdx * dimN + colIdx;
     if (label_d[rowIdx] == colIdx) {
       grad_d[index] -= 1.0f / output_d[index];
     }
   }
 }

-void hl_matrix_cross_entropy_bp(real* grad_d,
-                                real* output_d,
-                                int* label_d,
-                                int dimM,
-                                int dimN) {
+void hl_matrix_cross_entropy_bp(
+    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
   CHECK_NOTNULL(grad_d);
   CHECK_NOTNULL(output_d);
   CHECK_NOTNULL(label_d);

-  int blocksX = (dimM + 0)/1;
-  int blocksY = (dimN + 1024 -1) / 1024;
+  int blocksX = (dimM + 0) / 1;
+  int blocksY = (dimN + 1024 - 1) / 1024;
   dim3 threads(1, 1024);
   dim3 grid(blocksX, blocksY);
-  KeMatrixCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (grad_d, output_d, label_d, dimM, dimN);
+  KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_d, output_d, label_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
 }

 void hl_matrix_zero_mem(real* data, int num) {
-  hl_gpu_apply_unary_op(
-      unary::Zero<real>(), data, 1, num, num);
+  hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
 }

 __global__ void KeParamReluForward(real* output,
@@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output,
   int ty = blockIdx.y * blockDim.y + threadIdx.y;
   if (tx < width && ty < height) {
     int index = ty * width + tx;
-    output[index] = input[index] > 0 ? input[index] :
-        input[index] * w[tx / partial_sum];
+    output[index] =
+        input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
   }
 }

@@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output,
   CHECK_NOTNULL(w);
   dim3 threads(16, 16);
   int blockX = (width + 16 - 1) / 16;
-  int blockY = (height + 16 -1) / 16;
+  int blockY = (height + 16 - 1) / 16;
   dim3 grid(blockX, blockY);
-  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>
-      (output, input, w, width, height, partial_sum);
+  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, input, w, width, height, partial_sum);
   CHECK_SYNC("hl_param_relu_forward failed");
 }
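`KeParamReluForward` applies a parametric ReLU in which each group of `partial_sum` adjacent columns shares one learned negative slope. The per-element mapping, isolated as a sketch (hypothetical helper name):

```cuda
// Sketch: the PReLU mapping above; w holds one slope per column group.
__device__ float prelu(float x, const float* w, int col, int partial_sum) {
  return x > 0 ? x : x * w[col / partial_sum];
}
```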
-template<int blockSize>
+template <int blockSize>
 __global__ void KeParamReluBackWardW(real* grad_w,
                                      real* grad_o,
                                      real* input,
@@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w,
   int grid_num = width / partial_sum;
   dim3 threads(blockSize, 1);
   dim3 grid(grid_num, 1);
-  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (grad_w, grad_o, input, width, height, partial_sum);
+  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_w, grad_o, input, width, height, partial_sum);
   CHECK_SYNC("hl_param_relu_backward_w failed");
 }
@@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o,
   CHECK_NOTNULL(diff);
   dim3 threads(16, 16);
   int blockX = (width + 16 - 1) / 16;
-  int blockY = (height + 16 -1) / 16;
+  int blockY = (height + 16 - 1) / 16;
   dim3 grid(blockX, blockY);
-  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>
-      (grad_o, data, w, diff, width, height, partial_sum);
+  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_o, data, w, diff, width, height, partial_sum);
   CHECK_SYNC("hl_param_relu_backward_diff failed");
 }

-__global__ void KeMatrixAddSharedBias(real* A,
-                                      real* B,
-                                      const int channel,
-                                      const int M,
-                                      const int N,
-                                      real scale) {
+__global__ void KeMatrixAddSharedBias(
+    real* A, real* B, const int channel, const int M, const int N, real scale) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int dim = N / channel;
   if (index < M * N) {
@@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d,
                                real scale) {
   const int blocks = 512;
   const int grids = DIVUP(dimM * dimN, blocks);
-  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, channel, dimM, dimN, scale);
+  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
+      A_d, B_d, channel, dimM, dimN, scale);
   CHECK_SYNC("hl_matrix_add_shared_bias failed");
 }

 template <int blockSize>
-__global__ void KeMatrixCollectSharedBias(real *B,
-                                          real *A,
+__global__ void KeMatrixCollectSharedBias(real* B,
+                                          real* A,
                                           const int channel,
                                           const int M,
                                           const int N,
@@ -611,14 +563,13 @@ void hl_matrix_collect_shared_bias(real* B_d,
   const int limit = 64;
   int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;

-  KeMatrixCollectSharedBias<blocks>
-      <<< grids, blocks, 0, STREAM_DEFAULT>>>
-      (B_d, A_d, channel, dimM, dimN, dim, limit, scale);
+  KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
+      B_d, A_d, channel, dimM, dimN, dim, limit, scale);
   CHECK_SYNC("hl_matrix_collect_shared_bias failed");
 }

-__global__ void keMatrixRotate(real* mat, real* matRot,
-                               int dimM, int dimN, bool clockWise) {
+__global__ void keMatrixRotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < dimM * dimN) {
     int i = idx / dimN;
@@ -631,13 +582,13 @@ __global__ void keMatrixRotate(real* mat, real* matRot,
   }
 }

-void hl_matrix_rotate(real *mat, real* matRot,
-                      int dimM, int dimN, bool clockWise) {
+void hl_matrix_rotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
   CHECK_NOTNULL(mat);
   CHECK_NOTNULL(matRot);
   const int threads = 512;
   const int blocks = DIVUP(dimM * dimN, threads);
-  keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>>
-      (mat, matRot, dimM, dimN, clockWise);
+  keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
+      mat, matRot, dimM, dimN, clockWise);
   CHECK_SYNC("hl_matrix_rotate failed");
 }
@@ -16,36 +16,36 @@ limitations under the License. */

 #include "hl_device_functions.cuh"
 #include "paddle/utils/Logging.h"

-__global__ void KeMaxSequenceForward(real *input,
-                                     const int *sequence,
+__global__ void KeMaxSequenceForward(real* input,
+                                     const int* sequence,
                                      real* output,
-                                     int *index,
+                                     int* index,
                                      int numSequences,
                                      int dim) {
   int dimIdx = threadIdx.x;
   int sequenceId = blockIdx.x;
   if (sequenceId >= numSequences) return;
   int start = sequence[sequenceId];
-  int end = sequence[sequenceId+1];
+  int end = sequence[sequenceId + 1];

   for (int i = dimIdx; i < dim; i += blockDim.x) {
     real tmp = -HL_FLOAT_MAX;
     int tmpId = -1;
     for (int insId = start; insId < end; insId++) {
-      if (tmp < input[insId*dim + i]) {
-        tmp = input[insId*dim + i];
+      if (tmp < input[insId * dim + i]) {
+        tmp = input[insId * dim + i];
         tmpId = insId;
       }
     }
-    output[sequenceId*dim + i] = tmp;
-    index[sequenceId*dim + i] = tmpId;
+    output[sequenceId * dim + i] = tmp;
+    index[sequenceId * dim + i] = tmpId;
   }
 }

 void hl_max_sequence_forward(real* input,
                              const int* sequence,
                              real* output,
-                             int *index,
+                             int* index,
                              int numSequences,
                              int dim) {
   CHECK_NOTNULL(input);
@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input,
   dim3 threads(256, 1);
   dim3 grid(numSequences, 1);
-  KeMaxSequenceForward<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, sequence, output, index, numSequences, dim);
+  KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      input, sequence, output, index, numSequences, dim);
   CHECK_SYNC("hl_max_sequence_forward failed");
 }

-__global__ void KeMaxSequenceBackward(real *outputGrad,
-                                      int *index,
-                                      real* inputGrad,
-                                      int numSequences,
-                                      int dim) {
+__global__ void KeMaxSequenceBackward(
+    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
   int colIdx = idx % dim;
-  if (idx < numSequences*dim) {
+  if (idx < numSequences * dim) {
     int insId = index[idx];
     inputGrad[insId * dim + colIdx] += outputGrad[idx];
   }
 }

-void hl_max_sequence_backward(real* outputGrad,
-                              int *index,
-                              real* inputGrad,
-                              int numSequences,
-                              int dim) {
+void hl_max_sequence_backward(
+    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
   CHECK_NOTNULL(outputGrad);
   CHECK_NOTNULL(index);
   CHECK_NOTNULL(inputGrad);
@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad,
   unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
   dim3 threads(128, 1);
   dim3 grid(blocks, 1);
-  KeMaxSequenceBackward<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (outputGrad, index, inputGrad, numSequences, dim);
+  KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      outputGrad, index, inputGrad, numSequences, dim);
   CHECK_SYNC("hl_max_sequence_backward failed");
 }

-template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
+template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
 __global__ void KeMatrixAddRows(real* output,
                                 real* table,
                                 int* ids,
@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output,
   while (sampleId < numSamples) {
     int tableId = ids[sampleId];
     if ((0 <= tableId) && (tableId < tableSize)) {
-      real *outputData = output + sampleId * dim;
-      real *tableData = table + tableId * dim;
+      real* outputData = output + sampleId * dim;
+      real* tableData = table + tableId * dim;
       for (int i = idx; i < dim; i += blockDimX) {
         if (AddRow == 0) {
           outputData[i] += tableData[i];
@@ -114,15 +108,18 @@ __global__ void KeMatrixAddRows(real* output,
         }
       }
     }
-    sampleId += blockDimY*gridDimX;
+    sampleId += blockDimY * gridDimX;
   }
 }

-template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd>
-__global__
-void KeSequence2Batch(real *batch,
-                      real *sequence,
-                      const int *batchIndex,
+template <int blockDimX,
+          int blockDimY,
+          int gridDimX,
+          bool seq2batch,
+          bool isAdd>
+__global__ void KeSequence2Batch(real* batch,
+                                 real* sequence,
+                                 const int* batchIndex,
                                  int seqWidth,
                                  int batchCount) {
   int idx = threadIdx.x;
@@ -130,8 +127,8 @@ void KeSequence2Batch(real *batch,
   int id = blockIdx.x + idy * gridDimX;
   while (id < batchCount) {
     int seqId = batchIndex[id];
-    real* batchData = batch + id*seqWidth;
-    real* seqData = sequence + seqId*seqWidth;
+    real* batchData = batch + id * seqWidth;
+    real* seqData = sequence + seqId * seqWidth;
     for (int i = idx; i < seqWidth; i += blockDimX) {
       if (seq2batch) {
         if (isAdd) {
@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch,
       }
     }
-    id += blockDimY*gridDimX;
+    id += blockDimY * gridDimX;
   }
 }

-void hl_sequence2batch_copy(real *batch,
-                            real *sequence,
-                            const int *batchIndex,
+void hl_sequence2batch_copy(real* batch,
+                            real* sequence,
+                            const int* batchIndex,
                             int seqWidth,
                             int batchCount,
                             bool seq2batch) {
@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch,
   dim3 threads(128, 8);
   dim3 grid(8, 1);
   if (seq2batch) {
-    KeSequence2Batch<128, 8, 8, 1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (batch, sequence, batchIndex, seqWidth, batchCount);
+    KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
   } else {
-    KeSequence2Batch<128, 8, 8, 0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (batch, sequence, batchIndex, seqWidth, batchCount);
+    KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
   }
   CHECK_SYNC("hl_sequence2batch_copy failed");
 }

-void hl_sequence2batch_add(real *batch,
-                           real *sequence,
-                           int *batchIndex,
+void hl_sequence2batch_add(real* batch,
+                           real* sequence,
+                           int* batchIndex,
                            int seqWidth,
                            int batchCount,
                            bool seq2batch) {
@@ -186,18 +183,17 @@ void hl_sequence2batch_add(real *batch,
   dim3 threads(128, 8);
   dim3 grid(8, 1);
   if (seq2batch) {
-    KeSequence2Batch<128, 8, 8, 1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (batch, sequence, batchIndex, seqWidth, batchCount);
+    KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
   } else {
-    KeSequence2Batch<128, 8, 8, 0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (batch, sequence, batchIndex, seqWidth, batchCount);
+    KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
   }
   CHECK_SYNC("hl_sequence2batch_add failed");
 }
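`KeSequence2Batch` is one kernel specialized four ways by its `seq2batch` and `isAdd` template flags; per row it is just a strided copy or accumulate between `sequence[batchIndex[id]]` and `batch[id]`. That inner row operation as a sketch (hypothetical helper; `real` is Paddle's float/double typedef):

```cuda
// Sketch: the per-row copy/accumulate inside KeSequence2Batch.
__device__ void copyRow(real* dst, const real* src, int width, bool isAdd) {
  // Each thread of the block handles a strided slice of the row.
  for (int i = threadIdx.x; i < width; i += blockDim.x) {
    dst[i] = isAdd ? dst[i] + src[i] : src[i];
  }
}
```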
-template<bool normByTimes, bool seq2batch>
-__global__
-void KeSequence2BatchPadding(real* batch,
+template <bool normByTimes, bool seq2batch>
+__global__ void KeSequence2BatchPadding(real* batch,
                                         real* sequence,
                                         const int* sequenceStartPositions,
                                         const size_t sequenceWidth,
@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch,
   if (seq2batch) {
     /* sequence -> batch */
     if (normByTimes) {
-      KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
-          batch, sequence, sequenceStartPositions,
-          sequenceWidth, maxSequenceLength, numSequences);
+      KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
     } else {
-      KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
-          batch, sequence, sequenceStartPositions,
-          sequenceWidth, maxSequenceLength, numSequences);
+      KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
     }
   } else {
     /* batch -> sequence */
     if (normByTimes) {
-      KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
-          batch, sequence, sequenceStartPositions,
-          sequenceWidth, maxSequenceLength, numSequences);
+      KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
     } else {
-      KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
-          batch, sequence, sequenceStartPositions,
-          sequenceWidth, maxSequenceLength, numSequences);
+      KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
     }
   }
   CHECK_SYNC("hl_sequence2batch_copy_padding failed");
 }

-__device__ inline float my_rsqrt(float x) {
-  return rsqrtf(x);
-}
+__device__ inline float my_rsqrt(float x) { return rsqrtf(x); }

-__device__ inline double my_rsqrt(double x) {
-  return rsqrt(x);
-}
+__device__ inline double my_rsqrt(double x) { return rsqrt(x); }

 __global__ void KeSequenceAvgForward(real* dst,
                                      real* src,
@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst,
     for (int i = start; i < end; i++) {
       sum += src[i * width + col];
     }
-    sum = mode == 1 ? sum :
-        (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
+    sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
+                                       : sum * my_rsqrt((real)seqLength));
     dst[gid] += sum;
   }
 }
@@ -349,8 +357,8 @@ void hl_sequence_avg_forward(real* dst,
   CHECK(mode == 0 || mode == 1 || mode == 2)
       << "mode error in hl_sequence_avg_forward!";

-  KeSequenceAvgForward<<< grid, block, 0, STREAM_DEFAULT >>>
-      (dst, src, starts, height, width, mode);
+  KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
+      dst, src, starts, height, width, mode);
   CHECK_SYNC("hl_sequence_avg_forward failed");
 }
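The nested ternary encodes three pooling modes. Spelled out as a sketch (hypothetical helper; `my_rsqrt` is the overload defined above):

```cuda
// Sketch: the three `mode` normalizations in KeSequenceAvgForward/Backward.
__device__ real seqNorm(real sum, int seqLength, int mode) {
  if (mode == 1) return sum;              // plain sum
  if (mode == 0) return sum / seqLength;  // average
  return sum * my_rsqrt((real)seqLength); // mode 2: sum / sqrt(len)
}
```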
@@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst,
     int seqLength = end - start;
     if (seqLength == 0) return;
     real grad = src[gid];
-    grad = mode == 1 ? grad :
-        (mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength));
+    grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
+                                         : grad * my_rsqrt((real)seqLength));
     for (int i = start; i < end; i++) {
       dst[i * width + col] += grad;
     }
@@ -394,7 +402,7 @@ void hl_sequence_avg_backward(real* dst,
   CHECK(mode == 0 || mode == 1 || mode == 2)
       << "mode error in hl_sequence_avg_backward!";

-  KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>>
-      (dst, src, starts, height, width, mode);
+  KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
+      dst, src, starts, height, width, mode);
   CHECK_SYNC("hl_sequence_avg_backward failed");
 }
This diff is collapsed.
...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cmath>
#include <stdlib.h> #include <stdlib.h>
#include "hl_cuda.h" #include <cmath>
#include "hl_time.h"
#include "hl_base.h" #include "hl_base.h"
#include "hl_cuda.h"
#include "hl_perturbation_util.cuh" #include "hl_perturbation_util.cuh"
#include "hl_time.h"
#define _USE_MATH_DEFINES #define _USE_MATH_DEFINES
...@@ -30,10 +29,16 @@ limitations under the License. */ ...@@ -30,10 +29,16 @@ limitations under the License. */
* centerX, centerY: translation. * centerX, centerY: translation.
* sourceX, sourceY: output coordinates in the original image. * sourceX, sourceY: output coordinates in the original image.
*/ */
__device__ void getTranformCoord(int x, int y, real theta, real scale, __device__ void getTranformCoord(int x,
real tgtCenter, real imgCenter, int y,
real centerR, real centerC, real theta,
int* sourceX, int* sourceY) { real scale,
real tgtCenter,
real imgCenter,
real centerR,
real centerC,
int* sourceX,
int* sourceY) {
real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)}; real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};
// compute coornidates in the rotated and scaled image // compute coornidates in the rotated and scaled image
...@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale, ...@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
* created by Wei Xu (genome), converted by Jiang Wang * created by Wei Xu (genome), converted by Jiang Wang
*/ */
__global__ void kSamplingPatches(const real* imgs, real* targets, __global__ void kSamplingPatches(const real* imgs,
int imgSize, int tgtSize, const int channels, real* targets,
int samplingRate, const real* thetas, int imgSize,
const real* scales, const int* centerRs, int tgtSize,
const int* centerCs, const real padValue, const int channels,
int samplingRate,
const real* thetas,
const real* scales,
const int* centerRs,
const int* centerCs,
const real padValue,
const int numImages) { const int numImages) {
const int caseIdx = blockIdx.x * 4 + threadIdx.x;
const int pxIdx = blockIdx.y * 128 + threadIdx.y;
@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
  const int pxY = pxIdx / tgtSize;

  int srcPxX, srcPxY;
  getTranformCoord(pxX,
                   pxY,
                   thetas[imgIdx],
                   scales[imgIdx],
                   tgtCenter,
                   imgCenter,
                   centerCs[caseIdx],
                   centerRs[caseIdx],
                   &srcPxX,
                   &srcPxY);

  imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
 *
 * created by Wei Xu
 */
void hl_generate_disturb_params(real*& gpuAngle,
                                real*& gpuScaleRatio,
                                int*& gpuCenterR,
                                int*& gpuCenterC,
                                int numImages,
                                int imgSize,
                                real rotateAngle,
                                real scaleRatio,
                                int samplingRate,
                                bool isTrain) {
  // The number of output samples.
  int numPatches = numImages * samplingRate;
@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
    for (int i = 0; i < numImages; i++) {
      r_angle[i] =
          (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0)  // NOLINT
                                          -
                                          0.5);
      s_ratio[i] =
          1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio;  // NOLINT
    }
@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
      int pxY =
          (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT

      const real H[4] = {cos(-r_angle[i]),
                         -sin(-r_angle[i]),
                         sin(-r_angle[i]),
                         cos(-r_angle[i])};
      real x = pxX - imgCenter;
      real y = pxY - imgCenter;
      real xx = H[0] * x + H[1] * y;
@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
  delete[] center_c;
}

void hl_conv_random_disturb_with_params(const real* images,
                                        int imgSize,
                                        int tgtSize,
                                        int channels,
                                        int numImages,
                                        int samplingRate,
                                        const real* gpuRotationAngle,
                                        const real* gpuScaleRatio,
                                        const int* gpuCenterR,
@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
  dim3 threadsPerBlock(4, 128);
  dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));

  kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
                                                   target,
                                                   imgSize,
                                                   tgtSize,
                                                   channels,
                                                   samplingRate,
                                                   gpuRotationAngle,
                                                   gpuScaleRatio,
                                                   gpuCenterR,
                                                   gpuCenterC,
                                                   paddingValue,
                                                   numImages);

  hl_device_synchronize();
}
void hl_conv_random_disturb(const real* images,
                            int imgSize,
                            int tgtSize,
                            int channels,
                            int numImages,
                            real scaleRatio,
                            real rotateAngle,
                            int samplingRate,
                            real* gpu_r_angle,
                            real* gpu_s_ratio,
                            int* gpu_center_r,
                            int* gpu_center_c,
                            int paddingValue,
                            bool isTrain,
                            real* targets) {
  // generate the random disturbance sequence and the sampling locations
  hl_generate_disturb_params(gpu_r_angle,
                             gpu_s_ratio,
                             gpu_center_r,
                             gpu_center_c,
                             numImages,
                             imgSize,
                             rotateAngle,
                             scaleRatio,
                             samplingRate,
                             isTrain);

  hl_conv_random_disturb_with_params(images,
                                     imgSize,
                                     tgtSize,
                                     channels,
                                     numImages,
                                     samplingRate,
                                     gpu_r_angle,
                                     gpu_s_ratio,
                                     gpu_center_r,
                                     gpu_center_c,
                                     paddingValue,
                                     targets);
}
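Read together, the wrapper above runs a two-phase pipeline: draw per-image rotation/scale parameters plus per-patch sampling centers, then launch `kSamplingPatches` over the resulting `numImages * samplingRate` patches. A minimal host-side sketch of that flow; the buffer names and sizes here are illustrative, not part of this commit, and the parameter buffers are assumed to be allocated inside `hl_generate_disturb_params` (they are passed by pointer-reference):

```cpp
// Illustrative driver for the disturb pipeline. images/targets are
// assumed to be device buffers of the appropriate sizes.
void runDisturbExample(const real* images, real* targets) {
  real* gpu_angle = NULL;
  real* gpu_ratio = NULL;
  int* gpu_center_r = NULL;
  int* gpu_center_c = NULL;
  const int numImages = 16, imgSize = 32, tgtSize = 28;
  const int channels = 3, samplingRate = 2;

  // Phase 1: per-image angles/scales and per-patch centers.
  hl_generate_disturb_params(gpu_angle, gpu_ratio, gpu_center_r,
                             gpu_center_c, numImages, imgSize,
                             /* rotateAngle */ 10.0, /* scaleRatio */ 0.1,
                             samplingRate, /* isTrain */ true);
  // Phase 2: sample numImages * samplingRate patches with those params.
  hl_conv_random_disturb_with_params(images, imgSize, tgtSize, channels,
                                     numImages, samplingRate, gpu_angle,
                                     gpu_ratio, gpu_center_r, gpu_center_c,
                                     /* paddingValue */ 0, targets);
}
```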
@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"

template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
                                int ldo,
                                real* table,
                                int ldt,
                                int* ids,
                                int numSamples,
                                int tableSize,
@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  while (idy < numSamples) {
    int tableId = ids[idy];
    if ((0 <= tableId) && (tableId < tableSize)) {
      real* out = output + idy * ldo;
      real* tab = table + tableId * ldt;
      for (int i = idx; i < dim; i += blockDimX) {
        if (AddRow) {
          paddle::paddleAtomicAdd(&tab[i], out[i]);
@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  }
}

void hl_matrix_select_rows(real* output,
                           int ldo,
                           real* table,
                           int ldt,
                           int* ids,
                           int numSamples,
                           int tableSize,
@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, ldo, table, ldt, ids, numSamples, tableSize, dim);

  CHECK_SYNC("hl_matrix_select_rows failed");
}

void hl_matrix_add_to_rows(real* table,
                           int ldt,
                           real* input,
                           int ldi,
                           int* ids,
                           int numSamples,
                           int tableSize,
@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
      input, ldi, table, ldt, ids, numSamples, tableSize, dim);

  CHECK_SYNC("hl_matrix_add_to_rows failed");
}

template <class T, int blockDimX, int gridDimX>
__global__ void KeVectorSelect(
    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
  int idx = threadIdx.x + blockDimX * blockIdx.x;
  while (idx < sizei) {
    int index = ids[idx];
@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
}

template <class T>
void hl_vector_select_from(
    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
  CHECK_NOTNULL(dst);
  CHECK_NOTNULL(src);
  CHECK_NOTNULL(ids);
@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
  dim3 threads(512, 1);
  dim3 grid(8, 1);
  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
      dst, sized, src, sizes, ids, sizei);
  CHECK_SYNC("hl_vector_select_from failed");
}

template void hl_vector_select_from(real* dst,
                                    int sized,
                                    const real* src,
                                    int sizes,
                                    const int* ids,
                                    int sizei);
template void hl_vector_select_from(
    int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);
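One kernel serves both wrappers: the compile-time `AddRow` flag selects between gathering table rows into the output and atomically accumulating rows back into the table. A serial reference of the two modes may make the symmetry clearer; it assumes the elided `else` branch accumulates into `out`, and uses the same row strides `ld*` as above:

```cpp
// CPU reference for the two KeMatrixAddRows modes (illustrative).
// Mode 0 (hl_matrix_select_rows): output[i] += table[ids[i]].
// Mode 1 (hl_matrix_add_to_rows): table[ids[i]] += input[i].
void selectRowsRef(real* output, int ldo, const real* table, int ldt,
                   const int* ids, int numSamples, int tableSize, int dim) {
  for (int i = 0; i < numSamples; ++i) {
    int id = ids[i];
    if (id < 0 || id >= tableSize) continue;  // out-of-range ids are skipped
    for (int j = 0; j < dim; ++j) output[i * ldo + j] += table[id * ldt + j];
  }
}

void addToRowsRef(real* table, int ldt, const real* input, int ldi,
                  const int* ids, int numSamples, int tableSize, int dim) {
  for (int i = 0; i < numSamples; ++i) {
    int id = ids[i];
    if (id < 0 || id >= tableSize) continue;
    for (int j = 0; j < dim; ++j) table[id * ldt + j] += input[i * ldi + j];
  }
}
```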
@@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_sparse.ph"
#include "hl_top_k.h"
#include "paddle/utils/Logging.h"

// using namespace hppl;

struct Pair {
  __device__ __forceinline__ Pair() {}

  __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}

  __device__ __forceinline__ void set(real value, int id) {
    v_ = value;
    id_ = id;
  }

  __device__ __forceinline__ void operator=(const Pair& in) {
    v_ = in.v_;
    id_ = in.id_;
  }

  __device__ __forceinline__ bool operator<(const real value) const {
    return (v_ < value);
  }

  __device__ __forceinline__ bool operator<(const Pair& in) const {
    return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
  }

  __device__ __forceinline__ bool operator>(const Pair& in) const {
    return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
  }
@@ -58,8 +50,9 @@ struct Pair {
  int id_;
};
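Note the deliberate asymmetry in the comparisons: on equal values, `operator<` treats the pair with the larger index as smaller, so ties always resolve toward the smaller index. A host-side analogue of the same convention (illustrative, not part of the commit):

```cpp
// Host-side analogue of Pair's tie-breaking: equal values compare by
// index, and the smaller index wins when selecting the top elements.
struct PairRef {
  real v_;
  int id_;
};

bool pairGreater(const PairRef& a, const PairRef& b) {
  return (a.v_ > b.v_) || ((a.v_ == b.v_) && (a.id_ < b.id_));
}
// pairGreater({1.0, 2}, {1.0, 5}) is true: index 2 beats index 5 on a tie.
```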
__device__ __forceinline__ void addTo(Pair topK[],
                                      const Pair& p,
                                      int beamSize) {
  for (int k = beamSize - 2; k >= 0; k--) {
    if (topK[k] < p) {
      topK[k + 1] = topK[k];
@@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) {
  topK[0] = p;
}

template <int beamSize>
__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) {
  for (int k = beamSize - 2; k >= 0; k--) {
    if (topK[k] < p) {
      topK[k + 1] = topK[k];
@@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) {
  topK[0] = p;
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* src, int idx, int dim, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < src[idx]) {
      Pair tmp(src[idx], idx);
@@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < src[idx]) {
      Pair tmp(src[idx], idx);
@@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim,
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* val, int* col, int idx, int dim, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < val[idx]) {
      Pair tmp(val[idx], col[idx]);
@@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col,
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(Pair topK[],
                                        real* val,
                                        int* col,
                                        int idx,
                                        int dim,
                                        const Pair& max,
                                        int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < val[idx]) {
      Pair tmp(val[idx], col[idx]);
@@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
                                              int& beam,
                                              int beamSize,
                                              real* src,
                                              bool& firstStep,
                                              bool& isEmpty,
                                              Pair& max,
                                              int dim,
                                              const int tid) {
  if (beam > 0) {
    int length = beam < beamSize ? beam : beamSize;
    if (firstStep) {
@@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
      }
    }
    if (!isEmpty) {
      getTopK<blockSize>(topK + maxLength - beam, src, tid, dim, max, length);
    }
  }
@@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
                                              int& beam,
                                              int beamSize,
                                              real* val,
                                              int* col,
                                              bool& firstStep,
                                              bool& isEmpty,
                                              Pair& max,
                                              int dim,
                                              const int tid) {
  if (beam > 0) {
    int length = beam < beamSize ? beam : beamSize;
    if (firstStep) {
@@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
      }
    }
    if (!isEmpty) {
      getTopK<blockSize>(
          topK + maxLength - beam, val, col, tid, dim, max, length);
    }
  }
@@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void blockReduce(Pair* shTopK,
                                            int* maxId,
                                            Pair topK[],
                                            real** topVal,
                                            int** topIds,
                                            int& beam,
                                            int& beamSize,
                                            const int tid,
                                            const int warp) {
  while (true) {
    __syncthreads();
    if (tid < blockSize / 2) {
@@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
      }
    }
    __syncthreads();
    for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
      if (tid < stride) {
        if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
          maxId[tid] = maxId[tid + stride];
@@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
 * 3. go to the second step, until one thread's topK value is null;
 * 4. go to the first step, until we get the topK value.
 */
template <int maxLength, int blockSize>
__global__ void KeMatrixTopK(real* topVal,
                             int ldv,
                             int* topIds,
                             real* src,
                             int lds,
                             int dim,
                             int beamSize) {
  __shared__ Pair shTopK[blockSize];
@@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
    topK[k].set(-HL_FLOAT_MAX, -1);
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }
}
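The numbered comment above describes the cooperative scheme: each thread keeps a private beam of up to `maxLength` candidates from its strided slice of the row, the block repeatedly extracts the current global maximum through the shared-memory reduction, and a thread refills its beam only once it drains. The per-row result contract is easier to see in a serial sketch; this is plain C++ assuming `real` is Paddle's float alias and `beamSize <= dim`, and is not part of the commit:

```cpp
#include <algorithm>
#include <utility>
#include <vector>

// Serial reference for one row of KeMatrixTopK: write the beamSize
// largest values of src[0..dim) in descending order, breaking ties
// toward the smaller index (matching Pair::operator<).
void topKRowRef(real* topVal, int* topIds, const real* src, int dim,
                int beamSize) {
  std::vector<std::pair<real, int> > all(dim);
  for (int i = 0; i < dim; ++i) all[i] = std::make_pair(src[i], i);
  std::partial_sort(all.begin(), all.begin() + beamSize, all.end(),
                    [](const std::pair<real, int>& a,
                       const std::pair<real, int>& b) {
                      return a.first > b.first ||
                             (a.first == b.first && a.second < b.second);
                    });
  for (int k = 0; k < beamSize; ++k) {
    topVal[k] = all[k].first;
    topIds[k] = all[k].second;
  }
}
```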
template <int maxLength, int blockSize>
__global__ void KeSMatrixTopK(real* topVal,
                              int ldv,
                              int* topIds,
                              real* val,
                              int* row,
                              int* col,
@@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
    topK[k].set(-HL_FLOAT_MAX, -1);
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }
}

void hl_matrix_top_k(real* topVal,
                     int ldv,
                     int* topIds,
                     real* src,
                     int lds,
                     int dim,
                     int beamSize,
                     int numSamples) {
@@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv,
  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, src, lds, dim, beamSize);

  CHECK_SYNC("hl_matrix_top_k failed");
}
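A hypothetical call, for concreteness; the device buffers and shapes here are illustrative only:

```cpp
// Keep the top 5 of 1000 scores for each of 100 rows. ldv and lds are
// the row strides of the output and input matrices, respectively.
void topKExample(real* d_topVal, int* d_topIds, real* d_src) {
  const int numSamples = 100, dim = 1000, beamSize = 5;
  hl_matrix_top_k(d_topVal, /* ldv */ beamSize, d_topIds, d_src,
                  /* lds */ dim, dim, beamSize, numSamples);
}
```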
void hl_sparse_matrix_top_k(real* topVal,
                            int ldv,
                            int* topIds,
                            hl_sparse_matrix_s src,
                            int beamSize,
                            int numSamples) {
  CHECK_NOTNULL(topVal);
  CHECK_NOTNULL(topIds);
  CHECK_NOTNULL(src);
  CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";

  hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
  if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
    LOG(FATAL) << "parameter src is null!";
  }

  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);

  CHECK_SYNC("hl_sparse_matrix_top_k failed");
}
@@ -392,10 +402,12 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
 * 3. go to the second step, until one thread's topK value is null;
 * 4. go to the first step, until we get the topK value.
 */
template <int maxLength, int blockSize>
__global__ void KeMatrixTopKClassificationError(real* topVal,
                                                int ldv,
                                                int* topIds,
                                                real* src,
                                                int lds,
                                                int dim,
                                                int beamSize,
                                                int* label,
@@ -420,12 +432,12 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
  }

  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }

  __syncthreads();
@@ -440,9 +452,11 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
  }
}

void hl_matrix_classification_error(real* topVal,
                                    int ldv,
                                    int* topIds,
                                    real* src,
                                    int lds,
                                    int dim,
                                    int topkSize,
                                    int numSamples,
@@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv,
  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeMatrixTopKClassificationError<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);

  CHECK_SYNC("hl_matrix_top_k classification error failed");
}
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

syntax = "proto2";

package paddle.framework;

// Attribute Type for paddle's Op.
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

syntax = "proto2";

package paddle.framework;

import "attr_type.proto";
...
@@ -15,10 +15,11 @@ limitations under the License. */

// Protocol Message for 3rd-party language binding.
//
// Paddle Python package will use `OpProto` to generate op creation methods.
// The op creation methods take user's input and generate `OpDesc` proto
// message,
// then pass `OpDesc` to C++ side and create Op pointer.
//
syntax = "proto2";

package paddle.framework;

import "attr_type.proto";
@@ -32,13 +33,14 @@ message AttrProto {
  // Supported attribute type.
  required AttrType type = 2;

  // Supported attribute comments. It helps 3rd-party language generate
  // doc-string.
  required string comment = 3;

  // If that attribute is generated, it means the Paddle third language
  // binding has responsibility to fill that attribute. End-User should
  // not set that attribute.
  optional bool generated = 4 [ default = false ];
}

// Input or output message for 3rd-party language binding.
@@ -48,7 +50,8 @@ message VarProto {
  // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
  required string name = 1;

  // The comment for that input. It helps 3rd-party language generate
  // doc-string.
  required string comment = 2;

  // Whether that input/output could be a list or not.
@@ -70,7 +73,7 @@ message VarProto {
  //   }
  // }
  //
  optional bool multiple = 3 [ default = false ];

  // It marks that output is a temporary output. That output is not used by
  // user, but used by other op internally as input. If other op is not use
@@ -83,7 +86,7 @@ message VarProto {
  // attrs = {
  //   "temporary_index": [1]
  // }
  optional bool temporary = 4 [ default = false ];

  // The gradient of operator can be ignored immediately
  // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2
@@ -110,5 +113,4 @@ message OpProto {
  // The type of that Op.
  required string type = 5;
}
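For readers wiring these messages up from C++, the bracketed `[ default = ... ]` clauses mean an unset optional field reads back as `false`. A hedged sketch using the classes protoc generates for this file; the header name is assumed, the accessor names follow protoc's standard scheme, and none of this is part of the commit:

```cpp
#include "op_proto.pb.h"  // assumed name of the protoc-generated header

void varProtoDefaults() {
  paddle::framework::VarProto var;
  var.set_name("input");
  var.set_comment("the first operand of an op");
  // Optional fields that were never set report their declared defaults.
  bool multiple = var.multiple();    // false, from [ default = false ]
  bool temporary = var.temporary();  // false, from [ default = false ]
  (void)multiple;
  (void)temporary;
}
```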
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "ContextProjectionOp.h"
#include "hl_base.h"

namespace paddle {
@@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input,
  int block_size = blockDim.x;
  int sequenceId = blockIdx.x;
  int seq_start = sequence[sequenceId];
  int seq_end = sequence[sequenceId + 1];
  real value = 0;

  int instances = seq_end - seq_start + context_length - 1;
@@ -50,7 +50,8 @@ __global__ void KeContextProjectionForward(const real* input,
        if (padding) {
          value =
              weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
                         input_dim +
                     idx];
        } else {
          continue;
        }
@@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input,
  dim3 grid(blocks_x, blocks_y);

  if (weight) {
    KeContextProjectionForward<true><<<grid, threads, 0, STREAM_DEFAULT>>>(
        input,
        sequence,
        weight,
        output,
        input_dim,
        context_length,
        context_start,
        begin_pad);
  } else {
    KeContextProjectionForward<false><<<grid, threads, 0, STREAM_DEFAULT>>>(
        input,
        sequence,
        weight,
        output,
        input_dim,
        context_length,
        context_start,
        begin_pad);
  }
  CHECK_SYNC("hl_context_projection_forward failed");
}
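Context projection concatenates, for every position `t`, the `context_length` input rows starting at `t + context_start`; positions that fall outside the sequence are served by the learned `weight` rows when padding is enabled. A small host-side illustration of which input rows feed each output row (illustrative, not part of the commit):

```cpp
#include <cstdio>

// Print the context window of each output position. Indices outside
// [0, len) would come from the padding weights in the kernel above.
void printContextWindows(int len, int context_start, int context_length) {
  for (int t = 0; t < len; ++t) {
    std::printf("out[%d] <-", t);
    for (int c = 0; c < context_length; ++c) {
      std::printf(" %d", t + context_start + c);
    }
    std::printf("\n");
  }
}
// printContextWindows(4, -1, 3) gives:
//   out[0] <- -1 0 1
//   out[1] <- 0 1 2
//   out[2] <- 1 2 3
//   out[3] <- 2 3 4
```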
@@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
  int block_size = blockDim.x;
  int sequenceId = blockIdx.x;
  int seq_start = sequence[sequenceId];
  int seq_end = sequence[sequenceId + 1];
  real value = 0;

  int instances = seq_end - seq_start + context_length - 1;
@@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad,
  int blocks_y = 1;
  dim3 threads(block_size, 1);
  dim3 grid(blocks_x, blocks_y);
  KeContextProjectionBackwardData<<<grid, threads, 0, STREAM_DEFAULT>>>(
      out_grad, sequence, input_grad, input_dim, context_length, context_start);
  CHECK_SYNC("hl_context_projection_backward_data failed");
}
@@ -231,7 +244,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                    context_start);
}

template <int THREADS_X, int THREADS_Y>
__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
                                                  const int* sequence,
                                                  real* w_grad,
@@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
  if (weight_idx < w_dim) {
    for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
      int seq_start = sequence[seqId];
      int seq_end = sequence[seqId + 1];
      output_r =
          const_cast<real*>(out_grad) + seq_start * w_dim * context_length;

      if (context_start < 0) {
        if (padId + context_start < 0) {
          instanceId = padId;
        } else {
          // begin_pad > 0;
          instanceId =
              (padId - begin_pad) + (seq_end - seq_start) - context_start;
        }
      } else {
        if (padId + (seq_end - seq_start) < context_start) {
@@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
        }
      }

      int outx =
          (instanceId - context_length) < 0 ? instanceId : (context_length - 1);
      int outy = (instanceId - context_length) < 0
                     ? 0
                     : (instanceId - (context_length - 1));
      output_r += outy * w_dim * context_length + outx * w_dim;
      for (int j = outy; j < seq_end - seq_start; j++) {
        value += output_r[weight_idx];
@@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
  }
  __syncthreads();

  for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) {
    if (idy < stride) {
      sum_s[idy][idx] += sum_s[idy + stride][idx];
    }
@@ -339,16 +353,21 @@ void hl_context_projection_backward_weight(const real* out_grad,
  dim3 threads(threads_x, threads_y);
  dim3 grid(blocks_x, 1);

  KeContextProjectionBackwardWeight<32,
                                    32><<<grid, threads, 0, STREAM_DEFAULT>>>(
      out_grad,
      sequence,
      w_grad,
      num_sequences,
      w_dim,
      context_length,
      context_start,
      begin_pad);
  CHECK_SYNC("hl_context_projection_backward_weight failed");
}

template <>
void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                      GpuMatrix& w_grad,
                                                      const GpuIVector& seq_vec,
                                                      size_t context_length,
@@ -378,15 +397,10 @@ void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                size_t total_pad) {
  if (in_grad) {
    ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
        out_grad, in_grad, sequence, context_length, context_start);
  }
  if (is_padding && w_grad) {
    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(out_grad,
                                                     w_grad,
                                                     sequence,
                                                     context_length,
...
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CosSimOp.h"
#include "hl_base.h"
#include "hl_device_functions.cuh"

namespace paddle {

template <int block_size>
__global__ void KeCosSim(real* output,
                         const real* input1,
                         const real* input2,
@@ -78,8 +78,8 @@ void hlCossim(real* output,
  dim3 threads(block_size, 1);
  dim3 grid(1, input1_height);

  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, input1, input2, width, input1_height, input2_height, scale);
  CHECK_SYNC("hlCossim failed");
}
@@ -99,7 +99,7 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
  hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
}

template <int block_size>
__global__ void KeCosSimDerivative(const real* grad,
                                   const real* output,
                                   const real* prev_out_x,
@@ -148,13 +148,12 @@ __global__ void KeCosSimDerivative(const real* grad,
    if (xy[0] == 0) {
      real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
      for (int index = tid; index < width; index += block_size) {
        prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal;
        if (input2_height > 1) {
          prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal;
        } else {
          paddle::paddleAtomicAdd(
              prev_grad_y + index,
              scale * grad[ty] * prev_out_x[index] * reciprocal);
        }
      }
@@ -163,16 +162,17 @@ __global__ void KeCosSimDerivative(const real* grad,
      real reciprocalSquareSumX = 1.0 / xx[0];
      real reciprocalSquareSumY = 1.0 / yy[0];
      for (int index = tid; index < width; index += block_size) {
        prev_grad_x[index] +=
            output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY -
                                     prev_out_x[index] * reciprocalSquareSumX);
        if (input2_height > 1) {
          prev_grad_y[index] +=
              output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
                                       prev_out_y[index] * reciprocalSquareSumY);
        } else {
          paddle::paddleAtomicAdd(
              prev_grad_y + index,
              output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
                                       prev_out_y[index] * reciprocalSquareSumY));
        }
      }
@@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad,
  const int block_size = 256;
  dim3 threads(block_size, 1);
  dim3 grid(1, input1_height);
  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad,
      output,
      prev_out_x,
      prev_out_y,
      prev_grad_x,
      prev_grad_y,
      width,
      input1_height,
      input2_height,
      scale);
  CHECK_SYNC("hlCossimDerivate failed");
}
@@ -214,8 +222,8 @@ void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                     real scale) {
  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ &&
        in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
      << "Matrix types are not equally GPU";

  size_t dim = in1_val.getWidth();
...
@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CropOp.h"
#include "hl_base.h"

namespace paddle {

__global__ void KeCrop(real* outputs,
                       const real* inputs,
                       int inC,
                       int inH,
                       int inW,
                       int cropC,
                       int cropH,
                       int cropW,
                       int outC,
                       int outH,
                       int outW,
                       int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % outW;
@@ -58,16 +66,33 @@ void Crop<DEVICE_TYPE_GPU>(real* outputs,
  int blockSize = 1024;
  int gridSize = (nth + blockSize - 1) / blockSize;

  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
                                                     inputs,
                                                     inC,
                                                     inH,
                                                     inW,
                                                     cropC,
                                                     cropH,
                                                     cropW,
                                                     outC,
                                                     outH,
                                                     outW,
                                                     nth);
  CHECK_SYNC("Crop");
}

__global__ void KeCropDiff(const real* inGrad,
                           real* outGrad,
                           int inC,
                           int inH,
                           int inW,
                           int cropC,
                           int cropH,
                           int cropW,
                           int outC,
                           int outH,
                           int outW,
                           int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % inW;
@@ -107,9 +132,18 @@ void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
  int blockSize = 1024;
  int gridSize = (nth + blockSize - 1) / blockSize;

  KeCropDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
                                                         outGrad,
                                                         inC,
                                                         inH,
                                                         inW,
                                                         cropC,
                                                         cropH,
                                                         cropW,
                                                         outC,
                                                         outH,
                                                         outW,
                                                         nth);
  CHECK_SYNC("CropGrad");
}
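Both kernels recover `(n, c, h, w)` from the flat thread index with successive div/mod and then shift by the crop offsets. A CPU reference of the forward pass, assuming NCHW layout and crop offsets `(cropC, cropH, cropW)` (illustrative, not part of the commit):

```cpp
// Serial reference for the crop forward pass over NCHW tensors.
void cropRef(real* outputs, const real* inputs, int num, int inC, int inH,
             int inW, int cropC, int cropH, int cropW, int outC, int outH,
             int outW) {
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < outC; ++c) {
      for (int h = 0; h < outH; ++h) {
        for (int w = 0; w < outW; ++w) {
          const int src =
              ((n * inC + c + cropC) * inH + h + cropH) * inW + (w + cropW);
          const int dst = ((n * outC + c) * outH + h) * outW + w;
          outputs[dst] = inputs[src];
        }
      }
    }
  }
}
```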
...
@@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CrossMapNormalOp.h"
#include "hl_base.h"

namespace paddle {

__global__ void KeCMRNormFillScale(size_t imageSize,
                                   const real* in,
                                   real* scale,
                                   size_t channels,
                                   size_t height,
                                   size_t width,
                                   size_t size,
                                   real alpha) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < imageSize) {
@@ -51,8 +55,10 @@ __global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
  }
}

__global__ void KeCMRNormOutput(size_t inputSize,
                                const real* in,
                                const real* scale,
                                real negative_beta,
                                real* out) {
  const int index = threadIdx.x + blockIdx.x * blockDim.x;
  if (index < inputSize) {
@@ -74,24 +80,30 @@ void CrossMapNormal<DEVICE_TYPE_GPU>(real* outputs,
  size_t imageSize = numSamples * height * width;
  int blockSize = 1024;
  int gridSize = (imageSize + 1024 - 1) / 1024;
  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      imageSize, inputs, denoms, channels, height, width, size, scale);

  size_t inputSize = numSamples * height * width * channels;
  blockSize = 1024;
  gridSize = (inputSize + 1024 - 1) / 1024;
  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      inputSize, inputs, denoms, -pow, outputs);

  CHECK_SYNC("CrossMapNormal");
}
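The two launches split cross-map normalization (LRN across channels) into its usual phases: fill `denoms` with the per-position normalizer, then scale each input by `denoms^(-beta)` (the `-pow` passed above). A serial sketch of the math at a single spatial position; the window placement used here is the conventional centered one, which is an assumption since the kernel's exact offsets sit in an elided hunk:

```cpp
#include <cmath>

// Serial sketch of cross-map normalization for one (h, w) position:
// denom_c = 1 + alpha * sum of squares over a size-wide channel window,
// out_c = in_c * denom_c^(-beta). Window centering is an assumption.
void crossMapNormRef(real* out, real* denom, const real* in, int channels,
                     int size, real alpha, real beta) {
  for (int c = 0; c < channels; ++c) {
    const int start = c - size / 2;
    real accum = 0;
    for (int i = start; i < start + size; ++i) {
      if (i >= 0 && i < channels) accum += in[i] * in[i];
    }
    denom[c] = 1 + alpha * accum;
    out[c] = in[c] * std::pow(denom[c], -beta);
  }
}
```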
__global__ void KeCMRNormDiff(size_t imageSize,
                              const real* bottom_data,
                              const real* top_data,
                              const real* scale,
                              const real* top_diff,
                              size_t channels,
                              size_t height,
                              size_t width,
                              size_t size,
                              real negative_beta,
                              real cache_ratio,
                              real* bottom_diff) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < imageSize) {
    const int w = idx % width;
@@ -122,8 +134,8 @@ __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
      if (index >= post_pad) {
        bottom_diff[(index - post_pad) * step] +=
            top_diff[(index - post_pad) * step] *
                pow(scale[(index - post_pad) * step], negative_beta) -
            cache_ratio * bottom_data[(index - post_pad) * step] * accum;
      }
      ++index;
    }
@@ -147,9 +159,18 @@ void CrossMapNormalGrad<DEVICE_TYPE_GPU>(real* inputsGrad,

  int blockSize = 1024;
  int gridSize = (imageSize + 1024 - 1) / 1024;
  KeCMRNormDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(imageSize,
                                                            inputsValue,
                                                            outputsValue,
                                                            denoms,
                                                            outputsGrad,
                                                            channels,
                                                            height,
                                                            width,
                                                            size,
                                                            -pow,
                                                            2.0f * pow * scale,
                                                            inputsGrad);
  CHECK_SYNC("CrossMapNormalGrad");
}
...
@@ -20,17 +20,25 @@ namespace paddle {

// CUDA kernel to compute the depthwise convolution forward pass
template <class T>
__global__ void ConvolutionDepthwiseForward(const int nthreads,
                                            const T* const inputData,
                                            const T* const filterData,
                                            const int batchSize,
                                            const int outputChannels,
                                            const int outputHeight,
                                            const int outputWidth,
                                            const int inputChannels,
                                            const int inputHeight,
                                            const int inputWidth,
                                            const int filterMultiplier,
                                            const int filterHeight,
                                            const int filterWidth,
                                            const int strideH,
                                            const int strideW,
                                            const int paddingH,
                                            const int paddingW,
                                            T* const outputData) {
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if (index < nthreads) {
    const int batch = index / outputChannels / outputHeight / outputWidth;
@@ -45,14 +53,16 @@ void ConvolutionDepthwiseForward(const int nthreads,
    const int w_in_start = -paddingW + w_out * strideW;
    const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
    const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
    if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) &&
        (w_in_end < inputWidth)) {
      for (int kh = 0; kh < filterHeight; ++kh) {
        for (int kw = 0; kw < filterWidth; ++kw) {
          const int h_in = -paddingH + h_out * strideH + kh;
          const int w_in = -paddingW + w_out * strideW + kw;
          const int offset =
              ((batch * inputChannels + c_in) * inputHeight + h_in) *
                  inputWidth +
              w_in;
          value += (*weight) * inputData[offset];
          ++weight;
        }
@@ -62,10 +72,12 @@ void ConvolutionDepthwiseForward(const int nthreads,
        for (int kw = 0; kw < filterWidth; ++kw) {
          const int h_in = -paddingH + h_out * strideH + kh;
          const int w_in = -paddingW + w_out * strideW + kw;
          if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
              (w_in < inputWidth)) {
            const int offset =
                ((batch * inputChannels + c_in) * inputHeight + h_in) *
                    inputWidth +
                w_in;
            value += (*weight) * inputData[offset];
          }
          ++weight;
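The kernel maps one thread to each output element and takes a bounds-check-free fast path when the whole filter window lies inside the image. A serial reference of the forward computation, assuming NCHW tensors, one `fH x fW` filter per output channel, and `c_in = c_out / filterMultiplier` for the elided index math (illustrative, not part of the commit):

```cpp
// Serial reference for depthwise-convolution forward (illustrative).
template <class T>
void depthwiseForwardRef(T* out, const T* in, const T* filter, int batchSize,
                         int outC, int outH, int outW, int inC, int inH,
                         int inW, int multiplier, int fH, int fW, int sH,
                         int sW, int pH, int pW) {
  for (int n = 0; n < batchSize; ++n) {
    for (int c = 0; c < outC; ++c) {
      // Each input channel feeds `multiplier` consecutive output channels.
      const int cIn = c / multiplier;
      const T* w = filter + c * fH * fW;  // one filter per output channel
      for (int ho = 0; ho < outH; ++ho) {
        for (int wo = 0; wo < outW; ++wo) {
          T value = 0;
          for (int kh = 0; kh < fH; ++kh) {
            for (int kw = 0; kw < fW; ++kw) {
              const int hi = -pH + ho * sH + kh;
              const int wi = -pW + wo * sW + kw;
              if (hi >= 0 && hi < inH && wi >= 0 && wi < inW) {
                value += w[kh * fW + kw] *
                         in[((n * inC + cIn) * inH + hi) * inW + wi];
              }
            }
          }
          out[((n * outC + c) * outH + ho) * outW + wo] = value;
        }
      }
    }
  }
}
```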
...@@ -78,16 +90,25 @@ void ConvolutionDepthwiseForward(const int nthreads, ...@@ -78,16 +90,25 @@ void ConvolutionDepthwiseForward(const int nthreads,
// CUDA kernel to compute the depthwise convolution backprop w.r.t input. // CUDA kernel to compute the depthwise convolution backprop w.r.t input.
template <class T> template <class T>
__global__ __global__ void ConvolutionDepthwiseInputBackward(const int nthreads,
void ConvolutionDepthwiseInputBackward(const int nthreads, const T* const top_diff,
const T* const top_diff, const T* const weight_data, const T* const weight_data,
const int num, const int outputChannels, const int outputHeight, const int num,
const int outputWidth, const int inputChannels, const int inputHeight, const int outputChannels,
const int inputWidth, const int filterMultiplier, const int filterHeight, const int outputHeight,
const int filterWidth, const int strideH, const int strideW, const int outputWidth,
const int paddingH, const int paddingW, T* const bottom_diff) { const int inputChannels,
int index = const int inputHeight,
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; const int inputWidth,
const int filterMultiplier,
const int filterHeight,
const int filterWidth,
const int strideH,
const int strideW,
const int paddingH,
const int paddingW,
T* const bottom_diff) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) { if (index < nthreads) {
const int batch = index / inputChannels / inputHeight / inputWidth; const int batch = index / inputChannels / inputHeight / inputWidth;
const int c_in = (index / inputHeight / inputWidth) % inputChannels; const int c_in = (index / inputHeight / inputWidth) % inputChannels;
@@ -96,27 +117,29 @@ void ConvolutionDepthwiseInputBackward(const int nthreads,
     const int c_out_start = c_in * filterMultiplier;
-    int h_out_start = (h_in - filterHeight + paddingH + strideH)/strideH;
+    int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH;
     h_out_start = 0 > h_out_start ? 0 : h_out_start;
-    int h_out_end = (h_in + paddingH)/strideH;
-    h_out_end = outputHeight - 1 < h_out_end? outputHeight - 1 : h_out_end;
-    int w_out_start = (w_in - filterWidth + paddingW + strideW)/strideW;
+    int h_out_end = (h_in + paddingH) / strideH;
+    h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end;
+    int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW;
     w_out_start = 0 > w_out_start ? 0 : w_out_start;
-    int w_out_end = (w_in + paddingW)/strideW;
-    w_out_end = outputWidth - 1 < w_out_end? outputWidth - 1 : w_out_end;
+    int w_out_end = (w_in + paddingW) / strideW;
+    w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end;
     T value = 0;
-    for (int c_out = c_out_start;
-        c_out < c_out_start + filterMultiplier; c_out ++) {
+    for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier;
+         c_out++) {
       for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
         const int filter_h = h_in + paddingH - h_out * strideH;
         for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
           const int filter_w = w_in + paddingW - w_out * strideW;
-          const int filter_offset = c_out * filterHeight * filterWidth
-              + filter_h * filterWidth + filter_w;
-          const int top_diff_offset = ((batch * outputChannels + c_out) *
-              outputHeight + h_out)* outputWidth + w_out;
+          const int filter_offset = c_out * filterHeight * filterWidth +
+                                    filter_h * filterWidth + filter_w;
+          const int top_diff_offset =
+              ((batch * outputChannels + c_out) * outputHeight + h_out) *
+                  outputWidth +
+              w_out;
           value += top_diff[top_diff_offset] * weight_data[filter_offset];
         }
       }
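
The window bounds above come from inverting the forward mapping h_in = h_out * strideH + kh - paddingH with 0 <= kh < filterHeight (and likewise along the width axis): solving for h_out and clamping to [0, outputHeight - 1] yields exactly the start/end expressions in the kernel. A host-side sketch of the same computation (an illustrative helper, not code from this patch):

    // Range of output rows whose receptive field covers input row h_in.
    // Derived from h_in = h_out * strideH + kh - paddingH, 0 <= kh < filterHeight.
    void outputRowWindow(int h_in, int filterHeight, int strideH, int paddingH,
                         int outputHeight, int* start, int* end) {
      int s = (h_in - filterHeight + paddingH + strideH) / strideH;
      *start = s > 0 ? s : 0;
      int e = (h_in + paddingH) / strideH;
      *end = e < outputHeight - 1 ? e : outputHeight - 1;
    }
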
@@ -127,34 +150,47 @@ void ConvolutionDepthwiseInputBackward(const int nthreads,
 // CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
 template <class T>
-__global__
-void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
-    const T* const top_diff, const T* const inputData,
-    const int num, const int outputChannels, const int outputHeight,
-    const int outputWidth, const int inputChannels, const int inputHeight,
-    const int inputWidth, const int filterMultiplier, const int filterHeight,
-    const int filterWidth, const int strideH, const int strideW,
-    const int paddingH, const int paddingW, T* const buffer_data) {
-  int index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+__global__ void ConvolutionDepthwiseFilterBackward(const int num_i,
+                                                   const int nthreads,
+                                                   const T* const top_diff,
+                                                   const T* const inputData,
+                                                   const int num,
+                                                   const int outputChannels,
+                                                   const int outputHeight,
+                                                   const int outputWidth,
+                                                   const int inputChannels,
+                                                   const int inputHeight,
+                                                   const int inputWidth,
+                                                   const int filterMultiplier,
+                                                   const int filterHeight,
+                                                   const int filterWidth,
+                                                   const int strideH,
+                                                   const int strideW,
+                                                   const int paddingH,
+                                                   const int paddingW,
+                                                   T* const buffer_data) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     const int h_out = (index / outputWidth) % outputHeight;
     const int w_out = index % outputWidth;
-    const int kh = (index / filterWidth / outputHeight / outputWidth)
-        % filterHeight;
+    const int kh =
+        (index / filterWidth / outputHeight / outputWidth) % filterHeight;
     const int kw = (index / outputHeight / outputWidth) % filterWidth;
     const int h_in = -paddingH + h_out * strideH + kh;
     const int w_in = -paddingW + w_out * strideW + kw;
-    if ((h_in >= 0) && (h_in < inputHeight)
-        && (w_in >= 0) && (w_in < inputWidth)) {
-      const int c_out = index /
-          (filterHeight * filterWidth * outputHeight * outputWidth);
+    if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
+        (w_in < inputWidth)) {
+      const int c_out =
+          index / (filterHeight * filterWidth * outputHeight * outputWidth);
       const int c_in = c_out / filterMultiplier;
       const int batch = num_i;
-      const int top_offset = ((batch * outputChannels + c_out) *
-          outputHeight + h_out) * outputWidth + w_out;
-      const int bottom_offset = ((batch * inputChannels + c_in)
-          * inputHeight + h_in) * inputWidth + w_in;
+      const int top_offset =
+          ((batch * outputChannels + c_out) * outputHeight + h_out) *
+              outputWidth +
+          w_out;
+      const int bottom_offset =
+          ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth +
+          w_in;
       buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
     } else {
       buffer_data[index] = 0;
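
Instead of accumulating with atomics, this kernel writes one partial product per (filter tap, output position) into buffer_data, laid out so that each (c_out, kh, kw) slice is contiguous over outputHeight * outputWidth entries; each filter-gradient value is then the sum of its slice. The reduction itself is elided from this diff, but a CPU reference of what it must compute (a hypothetical helper, not PaddlePaddle's actual implementation) could look like:

    // Sum each filter slice of `buffer_data` into `filterGrad`.
    // filterSlices = outputChannels * filterHeight * filterWidth,
    // mapSize = outputHeight * outputWidth.
    template <class T>
    void reduceFilterGrad(const T* buffer_data, T* filterGrad,
                          int filterSlices, int mapSize) {
      for (int s = 0; s < filterSlices; ++s) {
        T sum = 0;
        for (int m = 0; m < mapSize; ++m) sum += buffer_data[s * mapSize + m];
        filterGrad[s] += sum;  // accumulated across batch samples
      }
    }
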
@@ -163,7 +199,7 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
 }
 
 template <class T>
-class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T>{
+class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T> {
 public:
   void operator()(const T* inputData,
                   const T* filterData,
@@ -181,17 +217,16 @@ public:
                   int strideW,
                   int paddingH,
                   int paddingW,
-                  T* outputData){
+                  T* outputData) {
     int outputSize = batchSize * outputChannels * outputHeight * outputWidth;
-    size_t blocks = (outputSize + 1024 -1) / 1024;
+    size_t blocks = (outputSize + 1024 - 1) / 1024;
     size_t blockX = 512;
-    size_t blockY = (blocks+512-1)/512;
+    size_t blockY = (blocks + 512 - 1) / 512;
     dim3 threads(1024, 1);
     dim3 grid(blockX, blockY);
-    ConvolutionDepthwiseForward<T>
-        <<< grid, threads, 0, STREAM_DEFAULT >>>(
+    ConvolutionDepthwiseForward<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
         outputSize,
         inputData,
         filterData,
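
All three functors share this launch geometry: one thread per element, 1024 threads per block, and the block count folded into a 2D grid so that neither grid dimension grows too large; the kernels then rebuild the linear index as (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x and guard with index < nthreads. A sketch of the host-side arithmetic (illustrative helper, not part of the patch):

    // Launch geometry shared by the depthwise-conv functors.
    inline void makeLaunchGeometry(int n, dim3* grid, dim3* threads) {
      size_t blocks = (n + 1024 - 1) / 1024;     // ceil(n / 1024) blocks total
      size_t blockX = 512;                       // fixed gridDim.x
      size_t blockY = (blocks + 512 - 1) / 512;  // ceil(blocks / 512)
      *threads = dim3(1024, 1);
      *grid = dim3(blockX, blockY);
    }
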
@@ -214,7 +249,7 @@ public:
 };
 
 template <class T>
-class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T>{
+class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T> {
 public:
   void operator()(const T* outputGrad,
                   const T* filterData,
@@ -232,20 +267,18 @@ public:
                   int strideW,
                   int paddingH,
                   int paddingW,
-                  T* inputGrad){
+                  T* inputGrad) {
     int inputSize = batchSize * inputChannels * inputHeight * inputWidth;
-    size_t blocks = (inputSize + 1024 -1) / 1024;
+    size_t blocks = (inputSize + 1024 - 1) / 1024;
     size_t blockX = 512;
-    size_t blockY = (blocks+512-1)/512;
+    size_t blockY = (blocks + 512 - 1) / 512;
     dim3 threads(1024, 1);
     dim3 grid(blockX, blockY);
     ConvolutionDepthwiseInputBackward<T>
         // NOLINT_NEXT_LINE(whitespace/operators)
-        <<< grid, threads, 0, STREAM_DEFAULT >>>(
-        inputSize,
+        <<<grid, threads, 0, STREAM_DEFAULT>>>(inputSize,
                                                outputGrad,
                                                filterData,
                                                batchSize,
@@ -286,22 +319,24 @@ public:
                   int paddingH,
                   int paddingW,
                   T* colData,
-                  T* filterGrad){
-    int colDataSize = outputChannels * filterHeight * filterWidth
-        * outputHeight * outputWidth;
-    size_t blocks = (colDataSize + 1024 -1) / 1024;
+                  T* filterGrad) {
+    int colDataSize = outputChannels * filterHeight * filterWidth *
+                      outputHeight * outputWidth;
+    size_t blocks = (colDataSize + 1024 - 1) / 1024;
     size_t blockX = 512;
-    size_t blockY = (blocks+512-1)/512;
+    size_t blockY = (blocks + 512 - 1) / 512;
     dim3 threads(1024, 1);
     dim3 grid(blockX, blockY);
     BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
-                                1, filterGrad, false, true);
+                                1,
+                                filterGrad,
+                                false,
+                                true);
     for (int i = 0; i < batchSize; i++) {
-      ConvolutionDepthwiseFilterBackward<T>
-          <<< grid, threads, 0, STREAM_DEFAULT >>>(
-          i,
+      ConvolutionDepthwiseFilterBackward<
+          T><<<grid, threads, 0, STREAM_DEFAULT>>>(i,
                                                    colDataSize,
                                                    outputGrad,
                                                    inputData,
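
The filter-gradient path launches the kernel once per batch sample (the loop over i) and reduces each sample's partial products into the shared filterGrad accumulator, so colData must hold colDataSize elements. A quick sanity check of that size (the dimensions are hypothetical, only to illustrate the formula):

    // Hypothetical sizes, only to illustrate the colDataSize formula.
    int outputChannels = 64, filterHeight = 3, filterWidth = 3;
    int outputHeight = 32, outputWidth = 32;
    int colDataSize = outputChannels * filterHeight * filterWidth *
                      outputHeight * outputWidth;  // 589824 elements
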
...