diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2ca988c406ae2987e26ca37dbc17cc0a2af43743..bb8c88787d37faf9ce4d7d856a307c11f1085d98 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,7 +24,7 @@ description: Format files with ClangFormat. entry: clang-format -i language: system - files: \.(c|cc|cxx|cpp|h|hpp|hxx)$ + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ - repo: https://github.com/PaddlePaddle/pre-commit-golang sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 hooks: diff --git a/paddle/cuda/src/hl_batch_transpose.cu b/paddle/cuda/src/hl_batch_transpose.cu index f047403da17e66960f029f2fee7312210009c952..f4c253df7b4be937f041f18587efd4c9d693fbe4 100644 --- a/paddle/cuda/src/hl_batch_transpose.cu +++ b/paddle/cuda/src/hl_batch_transpose.cu @@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_batch_transpose.h" #include "hl_base.h" +#include "hl_batch_transpose.h" const int TILE_DIM = 64; const int BLOCK_ROWS = 16; // No bank-conflict transpose for a batch of data. -__global__ void batchTransposeNoBankConflicts(real* odata, - const real* idata, - int numSamples, int width, - int height) { +__global__ void batchTransposeNoBankConflicts( + real* odata, const real* idata, int numSamples, int width, int height) { __shared__ float tile[TILE_DIM][TILE_DIM + 1]; const int x = blockIdx.x * TILE_DIM + threadIdx.x; @@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata, newX] = tile[threadIdx.x][j]; } -void batchTranspose(const real* input, real* output, int width, int height, - int batchSize) { +void batchTranspose( + const real* input, real* output, int width, int height, int batchSize) { dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize); - batchTransposeNoBankConflicts<<>> - (output, input, batchSize, width, height); + batchTransposeNoBankConflicts<<>>( + output, input, batchSize, width, height); CHECK_SYNC("batchTranspose failed!"); } diff --git a/paddle/cuda/src/hl_cuda_aggregate.cu b/paddle/cuda/src/hl_cuda_aggregate.cu index 97034a917708487d1c5dc59e6ebbf45bad1c3227..16a54ad343fa140aa1f3bec311c4b712d0086082 100644 --- a/paddle/cuda/src/hl_cuda_aggregate.cu +++ b/paddle/cuda/src/hl_cuda_aggregate.cu @@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - +#include "hl_aggregate.h" #include "hl_base.h" #include "hl_cuda.h" #include "hl_cuda.ph" -#include "hl_aggregate.h" -#include "hl_thread.ph" #include "hl_matrix_base.cuh" +#include "hl_thread.ph" #include "paddle/utils/Logging.h" /** * @brief matrix row operator. 
*/ -template -__global__ void KeMatrixRowOp(Agg agg, - real *E, - real *Sum, - int dimN) { +template +__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) { __shared__ real sum_s[blockSize]; - int cnt = (dimN + blockSize -1) / blockSize; - int rowId = blockIdx.x + blockIdx.y*gridDim.x; - int index = rowId*dimN; + int cnt = (dimN + blockSize - 1) / blockSize; + int rowId = blockIdx.x + blockIdx.y * gridDim.x; + int index = rowId * dimN; int tid = threadIdx.x; int lmt = tid; @@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg, sum_s[tid] = tmp; __syncthreads(); - for (int stride = blockSize/2; stride > 0; stride = stride/2) { + for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { if (tid < stride) { sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]); } @@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg, } template -void hl_matrix_row_op(Agg agg, - real *A_d, - real *C_d, - int dimM, - int dimN) { +void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) { int blocksX = dimM; int blocksY = 1; dim3 threads(128, 1); dim3 grid(blocksX, blocksY); - KeMatrixRowOp<<< grid, threads, 0, STREAM_DEFAULT >>> - (agg, A_d, C_d, dimN); + KeMatrixRowOp<<>>( + agg, A_d, C_d, dimN); } void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_row_op(aggregate::sum(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_row_sum failed"); } @@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_row_op(aggregate::max(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_row_max failed"); } @@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_row_op(aggregate::min(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_row_min failed"); } /** * @brief matrix column operator. 
*/ -template -__global__ void KeMatrixColumnOp(Agg agg, - real *E, - real *Sum, - int dimM, - int dimN) { +template +__global__ void KeMatrixColumnOp( + Agg agg, real *E, real *Sum, int dimM, int dimN) { int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; real tmp = agg.init(); if (rowIdx < dimN) { @@ -127,15 +104,12 @@ __global__ void KeMatrixColumnOp(Agg agg, } } -template -__global__ void KeMatrixColumnOp_S(Agg agg, - real *E, - real *Sum, - int dimM, - int dimN) { - __shared__ real _sum[blockDimX*blockDimY]; - int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; - int index = threadIdx.y; +template +__global__ void KeMatrixColumnOp_S( + Agg agg, real *E, real *Sum, int dimM, int dimN) { + __shared__ real _sum[blockDimX * blockDimY]; + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + int index = threadIdx.y; real tmp = agg.init(); if (rowIdx < dimN) { @@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg, index += blockDimY; } } - _sum[threadIdx.x + threadIdx.y*blockDimX] = tmp; + _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp; __syncthreads(); if (rowIdx < dimN) { - if (threadIdx.y ==0) { + if (threadIdx.y == 0) { real tmp = agg.init(); - for (int i=0; i < blockDimY; i++) { - tmp = agg(tmp, _sum[threadIdx.x + i*blockDimX]); + for (int i = 0; i < blockDimY; i++) { + tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]); } Sum[rowIdx] = tmp; } @@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg, } template -void hl_matrix_column_op(Agg agg, - real *A_d, - real *C_d, - int dimM, - int dimN) { +void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) { if (dimN >= 8192) { - int blocksX = (dimN + 128 -1) / 128; + int blocksX = (dimN + 128 - 1) / 128; int blocksY = 1; dim3 threads(128, 1); dim3 grid(blocksX, blocksY); - KeMatrixColumnOp<<< grid, threads, 0, STREAM_DEFAULT >>> - (agg, A_d, C_d, dimM, dimN); + KeMatrixColumnOp<<>>( + agg, A_d, C_d, dimM, dimN); } else { - int blocksX = (dimN + 32 -1) / 32; + int blocksX = (dimN + 32 - 1) / 32; int blocksY = 1; dim3 threads(32, 32); dim3 grid(blocksX, blocksY); - KeMatrixColumnOp_S<<< grid, threads, 0, STREAM_DEFAULT>>> - (agg, A_d, C_d, dimM, dimN); + KeMatrixColumnOp_S<<>>( + agg, A_d, C_d, dimM, dimN); } return; @@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_column_op(aggregate::sum(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_column_sum failed"); } @@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_column_op(aggregate::max(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_column_max failed"); } @@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_column_op(aggregate::min(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_column_min failed"); } @@ -226,16 +184,16 @@ template __global__ void KeVectorSum(real *E, real *Sum, int dimM) { __shared__ double sum_s[blockSize]; int tid = threadIdx.x; - int index = blockIdx.y*blockDim.x+threadIdx.x; + int index = blockIdx.y * blockDim.x + threadIdx.x; sum_s[tid] = 0.0f; while (index < dimM) { sum_s[tid] += E[index]; - index += blockDim.x*gridDim.y; + index += blockDim.x * 
gridDim.y; } __syncthreads(); - for (int stride = blockSize/2; stride > 0; stride = stride/2) { + for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { if (tid < stride) { sum_s[tid] += sum_s[tid + stride]; } @@ -259,38 +217,39 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) { dim3 threads(blockSize, 1); dim3 grid(blocksX, blocksY); - struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; + struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; hl_event_t hl_event = &hl_event_st; - while (!hl_cuda_event_is_ready(hl_event)) {} + while (!hl_cuda_event_is_ready(hl_event)) { + } - KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> - (A_d, t_resource.gpu_mem, dimM); - KeVectorSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> - (t_resource.gpu_mem, t_resource.cpu_mem, 128); + KeVectorSum<128><<>>( + A_d, t_resource.gpu_mem, dimM); + KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>( + t_resource.gpu_mem, t_resource.cpu_mem, 128); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_synchronize(HPPL_STREAM_DEFAULT); cudaError_t err = (cudaError_t)hl_get_device_last_error(); - CHECK_EQ(cudaSuccess, err) - << "CUDA error: " << hl_get_device_error_string((size_t)err); + CHECK_EQ(cudaSuccess, err) << "CUDA error: " + << hl_get_device_error_string((size_t)err); } template __global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) { __shared__ double sum_s[blockSize]; int tid = threadIdx.x; - int index = blockIdx.y*blockDim.x+threadIdx.x; + int index = blockIdx.y * blockDim.x + threadIdx.x; sum_s[tid] = 0.0f; while (index < dimM) { sum_s[tid] += abs(E[index]); - index += blockDim.x*gridDim.y; + index += blockDim.x * gridDim.y; } __syncthreads(); - for (int stride = blockSize/2; stride > 0; stride = stride/2) { + for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { if (tid < stride) { sum_s[tid] += sum_s[tid + stride]; } @@ -314,20 +273,21 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) { dim3 threads(blockSize, 1); dim3 grid(blocksX, blocksY); - struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; + struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; hl_event_t hl_event = &hl_event_st; - while (!hl_cuda_event_is_ready(hl_event)) {} + while (!hl_cuda_event_is_ready(hl_event)) { + } - KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> - (A_d, t_resource.gpu_mem, dimM); - KeVectorAbsSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> - (t_resource.gpu_mem, t_resource.cpu_mem, 128); + KeVectorAbsSum<128><<>>( + A_d, t_resource.gpu_mem, dimM); + KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>( + t_resource.gpu_mem, t_resource.cpu_mem, 128); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_synchronize(HPPL_STREAM_DEFAULT); cudaError_t err = (cudaError_t)hl_get_device_last_error(); - CHECK_EQ(cudaSuccess, err) - << "CUDA error: " << hl_get_device_error_string((size_t)err); + CHECK_EQ(cudaSuccess, err) << "CUDA error: " + << hl_get_device_error_string((size_t)err); } diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index b6e3e63a4f52261e49467bd82fdabd063e81460e..aac19b1ea566ad69f1f7374e393676c8debd9883 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -12,21 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - #include #include "hl_base.h" #include "hl_cnn.h" #include "hl_device_functions.cuh" -__global__ void KeMaxPoolForward(const int nthreads, const real* inputData, - const int channels, const int height, +__global__ void KeMaxPoolForward(const int nthreads, + const real* inputData, + const int channels, + const int height, const int width, - const int pooledH, const int pooledW, - const int ksizeW, const int ksizeH, - const int strideH, const int strideW, - const int offsetH, const int offsetW, - real* tgtData, const int tgtStride) { - int index = blockIdx.x * blockDim.x + threadIdx.x; + const int pooledH, + const int pooledW, + const int ksizeW, + const int ksizeH, + const int strideH, + const int strideW, + const int offsetH, + const int offsetW, + real* tgtData, + const int tgtStride) { + int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; int ph = (index / pooledW) % pooledH; @@ -46,44 +52,70 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData, maxval = inputData[h * width + w]; } } - int tgtIndex = index % (pooledW * pooledH * channels) + - frameNum * tgtStride; + int tgtIndex = + index % (pooledW * pooledH * channels) + frameNum * tgtStride; tgtData[tgtIndex] = maxval; } } -void hl_maxpool_forward(const int frameCnt, const real* inputData, +void hl_maxpool_forward(const int frameCnt, + const real* inputData, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real* tgtData, const int tgtStride) { - + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) { int num_kernels = pooledH * pooledW * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); - KeMaxPoolForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (num_kernels, inputData, channels, height, width, - pooledH, pooledW, sizeX, sizeY, strideH, strideW, - paddingH, paddingW, tgtData, tgtStride); + KeMaxPoolForward<<>>(num_kernels, + inputData, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + tgtData, + tgtStride); CHECK_SYNC("hl_maxpool_forward failed"); } -__global__ void KeMaxPoolBackward(const int nthreads, const real* inputData, - const real* outData, const real* outGrad, - const int channels, const int height, +__global__ void KeMaxPoolBackward(const int nthreads, + const real* inputData, + const real* outData, + const real* outGrad, + const int channels, + const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int padH, const int padW, - real scaleA, real scaleB, - real* targetGrad, const int outStride) { - int index = blockIdx.x * blockDim.x + threadIdx.x; + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int padH, + const int padW, + real scaleA, + real scaleB, + real* targetGrad, + const int outStride) { + int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { // find out the 
local index // find out the local offset @@ -107,43 +139,69 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData, } } } - targetGrad[index] = - scaleB * targetGrad[index] + scaleA * gradient; + targetGrad[index] = scaleB * targetGrad[index] + scaleA * gradient; } } -void hl_maxpool_backward(const int frameCnt, const real* inputData, - const real* outData, const real* outGrad, - const int channels, const int height, - const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real scaleA, real scaleB, - real* targetGrad, const int outStride) { - +void hl_maxpool_backward(const int frameCnt, + const real* inputData, + const real* outData, + const real* outGrad, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real scaleA, + real scaleB, + real* targetGrad, + const int outStride) { int num_kernels = height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; - KeMaxPoolBackward<<< blocks, 1024, 0, STREAM_DEFAULT >>> - (num_kernels, inputData, outData, outGrad, channels, - height, width, pooledH, pooledW, sizeX, sizeY, - strideH, strideW, - paddingH, paddingW, - scaleA, scaleB, - targetGrad, outStride); + KeMaxPoolBackward<<>>(num_kernels, + inputData, + outData, + outGrad, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleA, + scaleB, + targetGrad, + outStride); CHECK_SYNC("hl_maxpool_backward"); } -__global__ void KeAvgPoolForward(const int nthreads, const real* inputData, +__global__ void KeAvgPoolForward(const int nthreads, + const real* inputData, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int padH, const int padW, - real* tgtData, const int tgtStride) { + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int padH, + const int padW, + real* tgtData, + const int tgtStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; @@ -168,39 +226,64 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData, aveval += inputData[h * width + w]; } } - int tgtIndex = index % (pooledW * pooledH * channels) + - frameNum * tgtStride; + int tgtIndex = + index % (pooledW * pooledH * channels) + frameNum * tgtStride; tgtData[tgtIndex] = aveval / pool_size; } } -void hl_avgpool_forward(const int frameCnt, const real* inputData, +void hl_avgpool_forward(const int frameCnt, + const real* inputData, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real* tgtData, const int tgtStride) { + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) { int num_kernels = pooledH * pooledW * channels * frameCnt; int blocks = (num_kernels + 
1024 - 1) / 1024; - KeAvgPoolForward<<< blocks, 1024, 0, STREAM_DEFAULT >>> - (num_kernels, inputData, channels, - height, width, pooledH, pooledW, - sizeX, sizeY, strideH, strideW, - paddingH, paddingW, tgtData, tgtStride); + KeAvgPoolForward<<>>(num_kernels, + inputData, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + tgtData, + tgtStride); CHECK_SYNC("hl_avgpool_forward failed"); } -__global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, - const int channels, const int height, +__global__ void KeAvgPoolBackward(const int nthreads, + const real* outGrad, + const int channels, + const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int padH, const int padW, - real scaleA, real scaleB, - real* tgtGrad, const int outStride) { + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int padH, + const int padW, + real scaleA, + real scaleB, + real* tgtGrad, + const int outStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int offsetW = index % width + padW; @@ -215,7 +298,6 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, real gradient = 0; outGrad += (frameNum * outStride + offsetC * pooledH * pooledW); - for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { // figure out the pooling size @@ -224,32 +306,50 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, int hend = min(hstart + sizeY, height + padH); int wend = min(wstart + sizeX, width + padW); int poolsize = (hend - hstart) * (wend - wstart); - gradient += outGrad[ph * pooledW + pw]/poolsize; + gradient += outGrad[ph * pooledW + pw] / poolsize; } } tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient; } } -void hl_avgpool_backward(const int frameCnt, const real* outGrad, +void hl_avgpool_backward(const int frameCnt, + const real* outGrad, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real scaleA, real scaleB, - real* backGrad, const int outStride) { + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real scaleA, + real scaleB, + real* backGrad, + const int outStride) { int num_kernels = height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; - KeAvgPoolBackward <<< blocks, 1024, 0, STREAM_DEFAULT >>> - (num_kernels, outGrad, channels, height, width, - pooledH, pooledW, sizeX, sizeY, - strideH, strideW, - paddingH, paddingW, - scaleA, scaleB, - backGrad, outStride); + KeAvgPoolBackward<<>>(num_kernels, + outGrad, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleA, + scaleB, + backGrad, + outStride); CHECK_SYNC("hl_avgpool_backward failed"); } @@ -266,7 +366,7 @@ __global__ void KeBilinearInterpFw(const real* in, const size_t numChannels, const real ratioH, const real ratioW) { - int nthreads = outputH * outputW; + int nthreads = outputH * outputW; int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < nthreads) { int 
outIdH = tid / outputW; @@ -287,13 +387,14 @@ __global__ void KeBilinearInterpFw(const real* in, real w1lambda = ratioW * outImgIdx - inImgIdx; real w2lambda = 1.f - w1lambda; - const real* inPos = - &in[outIdH * inputW + channelId * inImgSize + inImgIdy * inImgW + inImgIdx]; + const real* inPos = &in[outIdH * inputW + channelId * inImgSize + + inImgIdy * inImgW + inImgIdx]; // bilinear interpolation out[outIdH * outputW + outIdW] = - h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) + - h1lambda * (w2lambda * inPos[hId * inImgW] + w1lambda * inPos[hId * inImgW + wId]); + h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) + + h1lambda * (w2lambda * inPos[hId * inImgW] + + w1lambda * inPos[hId * inImgW + wId]); } } @@ -313,9 +414,19 @@ void hl_bilinear_forward(const real* inData, int threadNum = outputH * outputW; int blocks = (threadNum + 1024 - 1) / 1024; - KeBilinearInterpFw<<< blocks, 1024, 0, STREAM_DEFAULT>>>( - inData, inImgH, inImgW, inputH, inputW, outData, outImgH, - outImgW, outputH, outputW, numChannels, ratioH, ratioW); + KeBilinearInterpFw<<>>(inData, + inImgH, + inImgW, + inputH, + inputW, + outData, + outImgH, + outImgW, + outputH, + outputW, + numChannels, + ratioH, + ratioW); CHECK_SYNC("hl_bilinear_forward failed"); } @@ -353,13 +464,15 @@ __global__ void KeBilinearInterpBw(real* in, real w1lambda = ratioW * outImgIdx - inImgIdx; real w2lambda = 1.f - w1lambda; - real* inPos = - &in[outIdH * inputW + channelId * inImgSize + inImgIdy * inImgW + inImgIdx]; + real* inPos = &in[outIdH * inputW + channelId * inImgSize + + inImgIdy * inImgW + inImgIdx]; const real* outPos = &out[outIdH * outputW + outIdW]; paddle::paddleAtomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]); paddle::paddleAtomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]); - paddle::paddleAtomicAdd(&inPos[hId * inImgW], h1lambda * w2lambda * outPos[0]); - paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId], h1lambda * w1lambda * outPos[0]); + paddle::paddleAtomicAdd(&inPos[hId * inImgW], + h1lambda * w2lambda * outPos[0]); + paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId], + h1lambda * w1lambda * outPos[0]); } } @@ -379,22 +492,37 @@ void hl_bilinear_backward(real* inGrad, int threadNum = outputH * outputW; int blocks = (threadNum + 1024 - 1) / 1024; - KeBilinearInterpBw<<< blocks, 1024, 0, STREAM_DEFAULT>>>( - inGrad, inImgH, inImgW, inputH, inputW, outGrad, outImgH, - outImgW, outputH, outputW, numChannels, ratioH, ratioW); + KeBilinearInterpBw<<>>(inGrad, + inImgH, + inImgW, + inputH, + inputW, + outGrad, + outImgH, + outImgW, + outputH, + outputW, + numChannels, + ratioH, + ratioW); CHECK_SYNC("hl_bilinear_backward failed"); } -__global__ void maxoutFpCompute(size_t nthreads, const real * inData, - real * outData, int* idData, - size_t size, size_t featLen, size_t groups) { +__global__ void maxoutFpCompute(size_t nthreads, + const real* inData, + real* outData, + int* idData, + size_t size, + size_t featLen, + size_t groups) { int index = blockIdx.x * blockDim.x + threadIdx.x; - if(index < nthreads) { + if (index < nthreads) { size_t batch_idx = index / size; size_t i = index % size; size_t channel_idx = i / featLen; size_t feat_idx = i % featLen; - size_t data_idx = (batch_idx * size + channel_idx * featLen) * groups + feat_idx; + size_t data_idx = + (batch_idx * size + channel_idx * featLen) * groups + feat_idx; real max = inData[data_idx]; int maxId = 0; for (size_t g = 1; g < groups; ++g) { @@ -409,37 +537,50 @@ __global__ void maxoutFpCompute(size_t nthreads, const real * inData, } } 
-void hl_maxout_forward(const real* inData, real* outData, - int* idData, size_t batchSize, size_t size, - size_t featLen, size_t groups) { +void hl_maxout_forward(const real* inData, + real* outData, + int* idData, + size_t batchSize, + size_t size, + size_t featLen, + size_t groups) { int num_kernels = size * batchSize; int blocks = (num_kernels + 1024 - 1) / 1024; - maxoutFpCompute<<< blocks, 1024, 0, STREAM_DEFAULT>>>( - num_kernels, inData, outData, idData, size, featLen, groups); + maxoutFpCompute<<>>( + num_kernels, inData, outData, idData, size, featLen, groups); CHECK_SYNC("hl_maxout_forward failed"); } -__global__ void maxoutBpCompute(size_t nthreads, real* inGrad, - const real* outGrad, const int* idData, - size_t size, size_t featLen, size_t groups) { +__global__ void maxoutBpCompute(size_t nthreads, + real* inGrad, + const real* outGrad, + const int* idData, + size_t size, + size_t featLen, + size_t groups) { int index = blockIdx.x * blockDim.x + threadIdx.x; - if(index < nthreads) { + if (index < nthreads) { size_t batch_idx = index / size; size_t i = index % size; size_t channel_idx = i / featLen; size_t feat_idx = i % featLen; size_t newIndex = batch_idx * size; - size_t gradIdx = (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx; + size_t gradIdx = + (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx; (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i]; } } -void hl_maxout_backward(real* inGrad, const real* outGrad, - const int* idData, size_t batchSize, size_t size, - size_t featLen, size_t groups) { +void hl_maxout_backward(real* inGrad, + const real* outGrad, + const int* idData, + size_t batchSize, + size_t size, + size_t featLen, + size_t groups) { int num_kernels = size * batchSize; int blocks = (num_kernels + 1024 - 1) / 1024; - maxoutBpCompute<<< blocks, 1024, 0, STREAM_DEFAULT >>>( - num_kernels, inGrad, outGrad, idData, size, featLen, groups); + maxoutBpCompute<<>>( + num_kernels, inGrad, outGrad, idData, size, featLen, groups); CHECK_SYNC("hl_maxout_backward failed"); } diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu index b869d903ba3cfb188f823518ba8ee7d17f9b2440..a5ce81a904ebbd655a16ef68660b81d442478575 100644 --- a/paddle/cuda/src/hl_cuda_lstm.cu +++ b/paddle/cuda/src/hl_cuda_lstm.cu @@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - +#include "hl_activation_functions.h" #include "hl_base.h" #include "hl_cuda_cublas.h" #include "hl_device_functions.cuh" -#include "hl_activation_functions.h" #include "paddle/utils/Logging.h" -typedef hppl::Active::forward t_forward; +typedef hppl::Active::forward t_forward; typedef hppl::Active::backward t_backward; bool hl_lstm_sequence_parallel(int frameSize) { @@ -42,9 +41,9 @@ public: value_ += (start + length - 1) * frameSize + idx; } } - __device__ inline real *getPtr() const {return value_;} - __device__ inline real getValue() {return *value_;} - __device__ inline void setValue(real value) {*value_ = value;} + __device__ inline real *getPtr() const { return value_; } + __device__ inline real getValue() { return *value_; } + __device__ inline void setValue(real value) { *value_ = value; } template __device__ inline void nextFrame() { if (reversed == 0) { @@ -55,28 +54,25 @@ public: } }; -__device__ __forceinline__ -void ptx_sync(const int id, const int barriers) { +__device__ __forceinline__ void ptx_sync(const int id, const int barriers) { asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(barriers) : "memory"); } -__device__ __forceinline__ -void ptx_arrive(const int id, const int barriers) { +__device__ __forceinline__ void ptx_arrive(const int id, const int barriers) { asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(barriers) : "memory"); } -template -__device__ __forceinline__ real -forward_sequence(real value, - real *shValue, - real *state, - real *preOutput, - real *output, - real check, - int index, - t_forward activeNode, - t_forward activeGate, - t_forward activeState) { +template +__device__ __forceinline__ real forward_sequence(real value, + real *shValue, + real *state, + real *preOutput, + real *output, + real check, + int index, + t_forward activeNode, + t_forward activeGate, + t_forward activeState) { real out; real prevOut; real state_r; @@ -112,17 +108,20 @@ forward_sequence(real value, if (idy == 0) { ptx_sync(2, frameSize * 2); prevOut = state[idx]; - prevOut = activeState(prevOut); + prevOut = activeState(prevOut); preOutput[idx] = prevOut; ptx_arrive(3, frameSize * 2); } return value; } -#define OUTPUT_BARRIER_ID 10 -#define OUTPUT_BARRIER_ID2 11 -template +#define OUTPUT_BARRIER_ID 10 +#define OUTPUT_BARRIER_ID2 11 +template __global__ void KeLstmForward(real *gateValue, real *state, real *output, @@ -184,10 +183,16 @@ __global__ void KeLstmForward(real *gateValue, } } value = forward_sequence( - value, shValue, shState, shPrevOutput, shOutput, check, index, - hppl::gpu::forward[active_node], - hppl::gpu::forward[active_gate], - hppl::gpu::forward[active_state]); + value, + shValue, + shState, + shPrevOutput, + shOutput, + check, + index, + hppl::gpu::forward[active_node], + hppl::gpu::forward[active_gate], + hppl::gpu::forward[active_state]); const int idx = index % frameSize; const int idy = index / frameSize; if (valueSize == 128) { @@ -218,7 +223,7 @@ __global__ void KeLstmForward(real *gateValue, real B_r[frameSize]; const int computeIdx = index - valueSize; if (i == 0) { - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { B_r[n] = weight[n * valueSize + computeIdx]; } @@ -230,7 +235,7 @@ __global__ void KeLstmForward(real *gateValue, } real sum = 0.0f; for (int n = 0; n < frameSize; n++) { - sum += A_r[n]*B_r[n]; + sum += A_r[n] * B_r[n]; } shValue[computeIdx] = sum; ptx_arrive(OUTPUT_BARRIER_ID2, blockSize); @@ -239,14 +244,14 @@ __global__ void KeLstmForward(real *gateValue, if (valueSize == 256) { real B_r[frameSize]; if (i 
== 0) { - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { B_r[n] = weight[n * valueSize + index]; } } real sum = 0.0f; for (int n = 0; n < frameSize; n++) { - sum += shOutput[n]*B_r[n]; + sum += shOutput[n] * B_r[n]; } value += sum; } @@ -273,50 +278,81 @@ void hl_lstm_parallel_forward(real *gateValue, dim3 grid(numSequences, 1); if (!reversed) { if (frameSize == 32) { - KeLstmForward<128, 32, 0, 128, 256> - <<>> - (gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<128, 32, 0, 128, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmForward<256, 64, 0, 256, 256> - <<>> - (gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<256, 64, 0, 256, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } } else { if (frameSize == 32) { - KeLstmForward<128, 32, 1, 128, 256> - <<>> - (gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<128, 32, 1, 128, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmForward<256, 64, 1, 256, 256> - <<>> - (gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<256, 64, 1, 256, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } } CHECK_SYNC("hl_lstm_parallel_forward failed"); } -__device__ __forceinline__ -void transpose_32x32(real a[], const int idx) { +__device__ __forceinline__ void transpose_32x32(real a[], const int idx) { int addr = idx % 32; - #pragma unroll +#pragma unroll for (int k = 1; k < 32; k++) { // rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32); addr = __shfl(addr, (idx + 1) % 32, 32); a[k] = __shfl(a[k], addr, 32); } - #pragma unroll +#pragma unroll for (int tid = 0; tid < 31; tid++) { real tmp = (idx > tid) ? a[0] : a[1]; - #pragma unroll +#pragma unroll for (int k = 31; k > 0; k--) { a[(k + 1) % 32] = (idx > tid) ? 
a[k] : a[(k + 1) % 32]; } @@ -324,29 +360,28 @@ void transpose_32x32(real a[], const int idx) { } addr = (32 - idx) % 32; - #pragma unroll +#pragma unroll for (int k = 0; k < 32; k++) { a[k] = __shfl(a[k], addr, 32); addr = __shfl(addr, (idx + 31) % 32, 32); } } -template -__device__ void -backward_sequence(real rGateValue, - real rOutputGrad, - real rPreOutputValue, - real &rGateGrad, - real &rStateGrad, - real *shStateGrad, - real *shStateValue, - real *shGateValue, - real rCheck, - real &rGateValuePrev, - int index, - t_backward activeNode, - t_backward activeGate, - t_backward activeState) { +template +__device__ void backward_sequence(real rGateValue, + real rOutputGrad, + real rPreOutputValue, + real &rGateGrad, + real &rStateGrad, + real *shStateGrad, + real *shStateValue, + real *shGateValue, + real rCheck, + real &rGateValuePrev, + int index, + t_backward activeNode, + t_backward activeGate, + t_backward activeState) { const int frameIdx = index % frameSize; const int frameIdy = index / frameSize; if (frameIdy == 3) { @@ -363,8 +398,8 @@ backward_sequence(real rGateValue, rStateGrad = rGateGrad * rCheck; shStateGrad[index] = rStateGrad; ptx_sync(3, valueSize); - rStateGrad += shStateGrad[frameIdx + frameSize *2]; - rStateGrad += shStateGrad[frameIdx + frameSize *3]; + rStateGrad += shStateGrad[frameIdx + frameSize * 2]; + rStateGrad += shStateGrad[frameIdx + frameSize * 3]; rGateGrad = rStateGrad * shGateValue[frameIdx]; rGateGrad = activeGate(rGateGrad, rGateValue); } else if (frameIdy == 2) { @@ -373,7 +408,7 @@ backward_sequence(real rGateValue, shStateGrad[index] = rStateGrad; ptx_sync(3, valueSize); rStateGrad += shStateGrad[frameIdx + frameSize]; - rStateGrad += shStateGrad[frameIdx + frameSize *3]; + rStateGrad += shStateGrad[frameIdx + frameSize * 3]; rGateValuePrev = rGateValue; rGateGrad = rStateGrad * shStateValue[frameIdx]; rGateGrad = activeGate(rGateGrad, rGateValue); @@ -381,43 +416,43 @@ backward_sequence(real rGateValue, shGateValue[frameIdx] = rGateValue; ptx_sync(3, valueSize); rStateGrad = shStateGrad[frameIdx + frameSize]; - rStateGrad += shStateGrad[frameIdx + frameSize *2]; - rStateGrad += shStateGrad[frameIdx + frameSize *3]; + rStateGrad += shStateGrad[frameIdx + frameSize * 2]; + rStateGrad += shStateGrad[frameIdx + frameSize * 3]; rGateGrad = rStateGrad * shGateValue[frameIdx + frameSize]; rGateGrad = activeNode(rGateGrad, rGateValue); } } -template +template __device__ void load_weight(real rWeight[], real *weight, const int index) { if (valueSize == 128) { weight += index; - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { - rWeight[n] = weight[n*valueSize]; + rWeight[n] = weight[n * valueSize]; } transpose_32x32(rWeight, index % 32); } if (valueSize == 256) { int id = (index / 32) % 2; weight += index - id * 32 + id * 32 * valueSize; - #pragma unroll +#pragma unroll for (int n = 0; n < 32; n++) { - rWeight[n] = weight[n*valueSize]; - rWeight[n + 32] = weight[n*valueSize + 32]; + rWeight[n] = weight[n * valueSize]; + rWeight[n + 32] = weight[n * valueSize + 32]; } transpose_32x32(rWeight, index % 32); transpose_32x32(&rWeight[32], index % 32); } } -template +template __global__ void KeLstmBackward(real *gateValue, real *gateGrad, real *stateValue, - real *stateGrad, /* do not need save */ + real *stateGrad, /* do not need save */ real *preOutputValue, - real *preOutputGrad, /* do not need save */ + real *preOutputGrad, /* do not need save */ real *checkIg, real *checkIgGrad, real *checkFg, @@ -484,20 +519,27 @@ __global__ void 
KeLstmBackward(real *gateValue, for (int i = 0; i < length; ++i) { if (frameIdy == 3) { - if (i != length -1) { + if (i != length - 1) { frameStateValue.nextFrame(); shStateValue[frameIdx] = frameStateValue.getValue(); } else { shStateValue[frameIdx] = 0.0; } } - backward_sequence( - rGateValue, rOutputGrad, rPreOutputValue, rGateGrad, - rStateGrad, shStateGrad, shStateValue, shGateValue, - rCheck, rGateValuePrev, index, - hppl::gpu::backward[active_node], - hppl::gpu::backward[active_gate], - hppl::gpu::backward[active_state]); + backward_sequence(rGateValue, + rOutputGrad, + rPreOutputValue, + rGateGrad, + rStateGrad, + shStateGrad, + shStateValue, + shGateValue, + rCheck, + rGateValuePrev, + index, + hppl::gpu::backward[active_node], + hppl::gpu::backward[active_gate], + hppl::gpu::backward[active_state]); if (frameIdy == 3) { rCheckGrad += rGateGrad * rStateValue; rStateValue = shStateValue[frameIdx]; @@ -523,9 +565,9 @@ __global__ void KeLstmBackward(real *gateValue, shGateGrad[frameIdy][frameIdx] = rGateGrad; if (valueSize == 128) { real sum = 0.0f; - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { - sum += shGateGrad[frameIdy][n]*B_r[n]; + sum += shGateGrad[frameIdy][n] * B_r[n]; } if (frameIdy == 3) { rOutputGrad += sum; @@ -541,7 +583,7 @@ __global__ void KeLstmBackward(real *gateValue, } real sum = 0.0f; for (int n = 0; n < frameSize; n++) { - sum += A_r[n]*B_r[n]; + sum += A_r[n] * B_r[n]; } if (frameIdy == 3) { rOutputGrad += sum; @@ -552,8 +594,8 @@ __global__ void KeLstmBackward(real *gateValue, if (frameIdy == 3) { ptx_sync(6, valueSize); - #pragma unroll - for (int i = 0; i < 3; i ++) { +#pragma unroll + for (int i = 0; i < 3; i++) { rOutputGrad += shOutputGrad[i][frameIdx]; } } else { @@ -564,11 +606,14 @@ __global__ void KeLstmBackward(real *gateValue, /* TODO: Temporary save & merger in another kernel */ if (frameIdy == 1) { - if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad); + if (checkIgGrad) + paddle::paddleAtomicAdd(checkIgGrad + frameIdx, rCheckGrad); } else if (frameIdy == 2) { - if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad); + if (checkFgGrad) + paddle::paddleAtomicAdd(checkFgGrad + frameIdx, rCheckGrad); } else if (frameIdy == 3) { - if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad); + if (checkOgGrad) + paddle::paddleAtomicAdd(checkOgGrad + frameIdx, rCheckGrad); } } @@ -593,68 +638,183 @@ void hl_lstm_parallel_backward_data(real *gateValue, hl_activation_mode_t active_node, hl_activation_mode_t active_gate, hl_activation_mode_t active_state) { - CHECK(frameSize == 32 || frameSize == 64 || - frameSize == 128 || frameSize == 256); + CHECK(frameSize == 32 || frameSize == 64 || frameSize == 128 || + frameSize == 256); dim3 grid(numSequences, 1); if (!reversed) { if (frameSize == 32) { - KeLstmBackward<128, 32, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<128, 32, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmBackward<256, 64, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, 
checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<256, 64, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 128) { - KeLstmBackward<512, 128, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<512, 128, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 256) { - KeLstmBackward<1024, 256, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<1024, 256, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } } else { if (frameSize == 32) { - KeLstmBackward<128, 32, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<128, 32, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmBackward<256, 64, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<256, 64, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 128) { - KeLstmBackward<512, 128, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<512, 128, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 256) { - KeLstmBackward<1024, 256, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + 
KeLstmBackward<1024, 256, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } } CHECK_SYNC("hl_lstm_parallel_backward_data"); } -template +template __global__ void KeSetGradZero(real *gateGrad, - const int *starts, int valueSize, int numSequences, bool reversed) { + const int *starts, + int valueSize, + int numSequences, + bool reversed) { // const int tid = threadIdx.x; const int frameIdx = blockIdx.x * B_X + threadIdx.x; @@ -682,19 +842,31 @@ void hl_lstm_parallel_backward_weight(real *weightGrad, int valueSize = 4 * frameSize; dim3 threads(32, 32); dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32); - KeSetGradZero<32, 32><<>> - (gateGrad, sequence, valueSize, numSequences, reversed); + KeSetGradZero<32, 32><<>>( + gateGrad, sequence, valueSize, numSequences, reversed); if (!reversed) { hl_matrix_mul(outputValue, - HPPL_OP_T, gateGrad + valueSize, HPPL_OP_N, weightGrad, - frameSize, valueSize, batchSize - 1, - 1.0, 1.0); + HPPL_OP_T, + gateGrad + valueSize, + HPPL_OP_N, + weightGrad, + frameSize, + valueSize, + batchSize - 1, + 1.0, + 1.0); } else { hl_matrix_mul(outputValue + frameSize, - HPPL_OP_T, gateGrad, HPPL_OP_N, weightGrad, - frameSize, valueSize, batchSize - 1, - 1.0, 1.0); + HPPL_OP_T, + gateGrad, + HPPL_OP_N, + weightGrad, + frameSize, + valueSize, + batchSize - 1, + 1.0, + 1.0); } CHECK_SYNC("hl_lstm_parallel_backward_weight"); } diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index 9bcc7fb7de44b2211db450fb164655f7947dcad9..39272456c394adc0509e60cf5972df832f7b3424 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "hl_base.h" +#include "hl_device_functions.cuh" +#include "hl_gpu_matrix_kernel.cuh" #include "hl_matrix.h" -#include "hl_matrix_ops.cuh" #include "hl_matrix_apply.cuh" +#include "hl_matrix_ops.cuh" #include "hl_sequence.h" #include "hl_sparse.ph" #include "paddle/utils/Logging.h" -#include "hl_device_functions.cuh" -#include "hl_gpu_matrix_kernel.cuh" DEFINE_MATRIX_UNARY_OP(Zero, a = 0); -DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b); -void hl_matrix_add(real *A_d, - real *B_d, - real *C_d, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b); +void hl_matrix_add(real* A_d, + real* B_d, + real* C_d, int dimM, int dimN, real alpha, @@ -36,33 +35,32 @@ void hl_matrix_add(real *A_d, CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); - hl_gpu_apply_ternary_op - , 0, 0>(ternary::_add(alpha, beta), - A_d, - B_d, - C_d, - dimM, - dimN, - dimN, - dimN, - dimN); + hl_gpu_apply_ternary_op, 0, 0>( + ternary::_add(alpha, beta), + A_d, + B_d, + C_d, + dimM, + dimN, + dimN, + dimN, + dimN); CHECK_SYNC("hl_matrix_add failed"); } #ifdef PADDLE_TYPE_DOUBLE - #define THRESHOLD 128 +#define THRESHOLD 128 #else - #define THRESHOLD 64 +#define THRESHOLD 64 #endif -__device__ __forceinline__ -void findMax(real* I, - real* dfMax_s, - int blockSize, - int base, - int curIdx, - int nextIdx, - int dimN, - real* max) { +__device__ __forceinline__ void findMax(real* I, + real* dfMax_s, + int blockSize, + int base, + int curIdx, + int nextIdx, + int dimN, + real* max) { dfMax_s[base] = -1.0e20; while (curIdx < dimN) { if (dfMax_s[base] < I[nextIdx]) { @@ -78,25 +76,24 @@ void findMax(real* I, if (base < stride) { nextIdx = base + stride; if (dfMax_s[base] < dfMax_s[nextIdx]) { - dfMax_s[base] = dfMax_s[nextIdx]; + dfMax_s[base] = dfMax_s[nextIdx]; } } } - if (0 == base) { + if (0 == base) { max[0] = dfMax_s[0]; } __syncthreads(); } -__device__ __forceinline__ -void subMaxAndExp(real* I, - real* O, - int curIdx, - int nextIdx, - int blockSize, - int dimN, - real max) { +__device__ __forceinline__ void subMaxAndExp(real* I, + real* O, + int curIdx, + int nextIdx, + int blockSize, + int dimN, + real max) { real val; while (curIdx < dimN) { val = I[nextIdx] - max; @@ -115,14 +112,13 @@ void subMaxAndExp(real* I, __syncthreads(); } -__device__ __forceinline__ -void valueSum(real* O, - real* dfMax_s, - int blockSize, - int base, - int curIdx, - int nextIdx, - int dimN) { +__device__ __forceinline__ void valueSum(real* O, + real* dfMax_s, + int blockSize, + int base, + int curIdx, + int nextIdx, + int dimN) { dfMax_s[base] = 0; while (curIdx < dimN) { dfMax_s[base] += O[nextIdx]; @@ -141,13 +137,8 @@ void valueSum(real* O, __syncthreads(); } -__device__ __forceinline__ -void divSum(real* O, - real sum, - int curIdx, - int nextIdx, - int blockSize, - int dimN) { +__device__ __forceinline__ void divSum( + real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) { while (curIdx < dimN) { O[nextIdx] /= sum; nextIdx += blockSize; @@ -155,20 +146,18 @@ void divSum(real* O, } } -__device__ __forceinline__ -void softmax(real* I, - real* O, - real* dfMax_s, - int blockSize, - int base, - int curIdx, - int nextIdx, - int dimN) { +__device__ __forceinline__ void softmax(real* I, + real* O, + real* dfMax_s, + int blockSize, + int base, + int curIdx, + int nextIdx, + int dimN) { __shared__ real max; // find the max number - findMax(I, dfMax_s, blockSize, base, curIdx, - nextIdx, dimN, &max); + findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max); // sub max 
Value and do Exp operation subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max); @@ -181,8 +170,8 @@ void softmax(real* I, divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN); } -template -__global__ void KeMatrixSoftMax(real *O, real *I, int dimN) { +template +__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) { int base = threadIdx.x; __shared__ real dfMax_s[blockSize]; int nextIdx = blockIdx.x * dimN + base; @@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) { softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); } -void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) { +void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); dim3 block(512, 1); dim3 grid(dimM, 1); - KeMatrixSoftMax<512> - <<>>(C_d, A_d, dimN); + KeMatrixSoftMax<512><<>>(C_d, A_d, dimN); CHECK_SYNC("hl_matrix_softmax failed"); } -template -__global__ void KeSequenceSoftMax(real *O, real *I, const int* index) { +template +__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) { int base = threadIdx.x; int bid = blockIdx.x; __shared__ real dfMax_s[blockSize]; @@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) { softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); } -void hl_sequence_softmax_forward(real *A_d, - real *C_d, +void hl_sequence_softmax_forward(real* A_d, + real* C_d, const int* index, int numSequence) { CHECK_NOTNULL(A_d); @@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d, dim3 block(512, 1); dim3 grid(numSequence, 1); - KeSequenceSoftMax<512> - <<>>(C_d, A_d, index); + KeSequenceSoftMax<512><<>>(C_d, A_d, index); CHECK_SYNC("hl_sequence_softmax_forward failed"); } -__global__ void KeMatrixDerivative(real *grad_d, - real *output_d, - real *sftmaxSum_d, - int dimM, - int dimN) { - int rowIdx = blockIdx.x*blockDim.x + threadIdx.x; - int colIdx = blockIdx.y*blockDim.y + threadIdx.y; +__global__ void KeMatrixDerivative( + real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) { + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + int colIdx = blockIdx.y * blockDim.y + threadIdx.y; int index; if (rowIdx < dimM && colIdx < dimN) { - index = rowIdx*dimN + colIdx; + index = rowIdx * dimN + colIdx; grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]); } } -void hl_matrix_softmax_derivative(real *grad_d, - real *output_d, - real *sftmaxSum_d, - int dimM, - int dimN) { +void hl_matrix_softmax_derivative( + real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) { CHECK_NOTNULL(grad_d); CHECK_NOTNULL(output_d); CHECK_NOTNULL(sftmaxSum_d); int blocksX = (dimM + 0) / 1; - int blocksY = (dimN + 1024 -1) / 1024; + int blocksY = (dimN + 1024 - 1) / 1024; dim3 threads(1, 1024); dim3 grid(blocksX, blocksY); - KeMatrixDerivative<<< grid, threads, 0, STREAM_DEFAULT >>> - (grad_d, output_d, sftmaxSum_d, dimM, dimN); + KeMatrixDerivative<<>>( + grad_d, output_d, sftmaxSum_d, dimM, dimN); CHECK_SYNC("hl_matrix_softmax_derivative failed"); } -__global__ void KeMatrixMultiBinaryCrossEntropy(real* output, - real* entropy, - int* row, - int* col, - int dimM, - int dimN) { +__global__ void KeMatrixMultiBinaryCrossEntropy( + real* output, real* entropy, int* row, int* col, int dimM, int dimN) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < dimM) { - for (int i = 0; i < dimN; i ++) { + for (int i = 0; i < dimN; i++) { entropy[index] -= log(1 - output[index * dimN + i]); } - int 
*row_col = col + row[index]; + int* row_col = col + row[index]; int col_num = row[index + 1] - row[index]; - for (int i = 0; i < col_num; i ++) { + for (int i = 0; i < col_num; i++) { real o = output[index * dimN + row_col[i]]; entropy[index] -= log(o / (1 - o)); } @@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output, dim3 threads(n_threads); dim3 grid(blocks); hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); - KeMatrixMultiBinaryCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>> - (output, entropy, mat->csr_row, mat->csr_col, dimM, dimN); + KeMatrixMultiBinaryCrossEntropy<<>>( + output, entropy, mat->csr_row, mat->csr_col, dimM, dimN); CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed"); } -__global__ void KeMatrixMultiBinaryCrossEntropyBp(real* output, - real* grad, - int* row, - int* col, - int dimM, - int dimN) { +__global__ void KeMatrixMultiBinaryCrossEntropyBp( + real* output, real* grad, int* row, int* col, int dimM, int dimN) { int row_idx = blockIdx.x * blockDim.x + threadIdx.x; if (row_idx < dimM) { - for (int i = 0; i < dimN; i ++) { + for (int i = 0; i < dimN; i++) { int index = row_idx * dimN + i; grad[index] += 1.0 / (1 - output[index]); } int col_num = row[row_idx + 1] - row[row_idx]; - int *row_col = col + row[row_idx]; - for (int i = 0; i < col_num; i ++) { + int* row_col = col + row[row_idx]; + for (int i = 0; i < col_num; i++) { int index = row_idx * dimN + row_col[i]; grad[index] -= 1.0 / (output[index] * (1 - output[index])); } } } -void hl_matrix_multi_binary_cross_entropy_bp(real* output, - real* grad, - hl_sparse_matrix_s csr_mat, - int dimM, - int dimN) { +void hl_matrix_multi_binary_cross_entropy_bp( + real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) { CHECK_NOTNULL(output); CHECK_NOTNULL(grad); CHECK_NOTNULL(csr_mat); @@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output, dim3 threads(n_threads); dim3 grid(blocks); hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); - KeMatrixMultiBinaryCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>> - (output, grad, mat->csr_row, mat->csr_col, dimM, dimN); + KeMatrixMultiBinaryCrossEntropyBp<<>>( + output, grad, mat->csr_row, mat->csr_col, dimM, dimN); CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed"); } -__global__ void KeMatrixCrossEntropy(real* O, - real* E, - int* label, - int dimM, - int dimN) { +__global__ void KeMatrixCrossEntropy( + real* O, real* E, int* label, int dimM, int dimN) { int index = blockIdx.x * blockDim.x + threadIdx.x; int newBase; if (index < dimM) { @@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O, } } -void hl_matrix_cross_entropy(real* A_d, - real* C_d, - int* label_d, - int dimM, - int dimN) { +void hl_matrix_cross_entropy( + real* A_d, real* C_d, int* label_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); int blocks = (dimM + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); - KeMatrixCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>> - (A_d, C_d, label_d, dimM, dimN); + KeMatrixCrossEntropy<<>>( + A_d, C_d, label_d, dimM, dimN); CHECK_SYNC("hl_matrix_cross_entropy failed"); } -__global__ void KeMatrixCrossEntropyBp(real* grad_d, - real* output_d, - int* label_d, - int dimM, - int dimN) { - int rowIdx = blockIdx.x*blockDim.x + threadIdx.x; - int colIdx = blockIdx.y*blockDim.y + threadIdx.y; +__global__ void KeMatrixCrossEntropyBp( + real* grad_d, real* output_d, int* label_d, int dimM, int dimN) { + int rowIdx = blockIdx.x * blockDim.x + 
threadIdx.x; + int colIdx = blockIdx.y * blockDim.y + threadIdx.y; int index; if (rowIdx < dimM && colIdx < dimN) { - index = rowIdx*dimN + colIdx; + index = rowIdx * dimN + colIdx; if (label_d[rowIdx] == colIdx) { grad_d[index] -= 1.0f / output_d[index]; } } } -void hl_matrix_cross_entropy_bp(real* grad_d, - real* output_d, - int* label_d, - int dimM, - int dimN) { +void hl_matrix_cross_entropy_bp( + real* grad_d, real* output_d, int* label_d, int dimM, int dimN) { CHECK_NOTNULL(grad_d); CHECK_NOTNULL(output_d); CHECK_NOTNULL(label_d); - int blocksX = (dimM + 0)/1; - int blocksY = (dimN + 1024 -1) / 1024; + int blocksX = (dimM + 0) / 1; + int blocksY = (dimN + 1024 - 1) / 1024; dim3 threads(1, 1024); dim3 grid(blocksX, blocksY); - KeMatrixCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>> - (grad_d, output_d, label_d, dimM, dimN); + KeMatrixCrossEntropyBp<<>>( + grad_d, output_d, label_d, dimM, dimN); CHECK_SYNC("hl_matrix_cross_entropy_bp failed"); } void hl_matrix_zero_mem(real* data, int num) { - hl_gpu_apply_unary_op( - unary::Zero(), data, 1, num, num); + hl_gpu_apply_unary_op(unary::Zero(), data, 1, num, num); } __global__ void KeParamReluForward(real* output, @@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output, int ty = blockIdx.y * blockDim.y + threadIdx.y; if (tx < width && ty < height) { int index = ty * width + tx; - output[index] = input[index] > 0 ? input[index] : - input[index] * w[tx / partial_sum]; + output[index] = + input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum]; } } @@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output, CHECK_NOTNULL(w); dim3 threads(16, 16); int blockX = (width + 16 - 1) / 16; - int blockY = (height + 16 -1) / 16; + int blockY = (height + 16 - 1) / 16; dim3 grid(blockX, blockY); - KeParamReluForward<<>> - (output, input, w, width, height, partial_sum); + KeParamReluForward<<>>( + output, input, w, width, height, partial_sum); CHECK_SYNC("hl_param_relu_forward failed"); } -template +template __global__ void KeParamReluBackWardW(real* grad_w, real* grad_o, real* input, @@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w, int grid_num = width / partial_sum; dim3 threads(blockSize, 1); dim3 grid(grid_num, 1); - KeParamReluBackWardW<<>> - (grad_w, grad_o, input, width, height, partial_sum); + KeParamReluBackWardW<<>>( + grad_w, grad_o, input, width, height, partial_sum); CHECK_SYNC("hl_param_relu_backward_w failed"); } @@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o, CHECK_NOTNULL(diff); dim3 threads(16, 16); int blockX = (width + 16 - 1) / 16; - int blockY = (height + 16 -1) / 16; + int blockY = (height + 16 - 1) / 16; dim3 grid(blockX, blockY); - KeParamReluBackwardDiff<<>> - (grad_o, data, w, diff, width, height, partial_sum); + KeParamReluBackwardDiff<<>>( + grad_o, data, w, diff, width, height, partial_sum); CHECK_SYNC("hl_param_relu_backward_diff failed"); } -__global__ void KeMatrixAddSharedBias(real* A, - real* B, - const int channel, - const int M, - const int N, - real scale) { +__global__ void KeMatrixAddSharedBias( + real* A, real* B, const int channel, const int M, const int N, real scale) { int index = blockIdx.x * blockDim.x + threadIdx.x; int dim = N / channel; if (index < M * N) { @@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d, real scale) { const int blocks = 512; const int grids = DIVUP(dimM * dimN, blocks); - KeMatrixAddSharedBias<<>> - (A_d, B_d, channel, dimM, dimN, scale); + KeMatrixAddSharedBias<<>>( + A_d, B_d, channel, dimM, dimN, 
scale); CHECK_SYNC("hl_matrix_add_shared_bias failed"); } - template -__global__ void KeMatrixCollectSharedBias(real *B, - real *A, +__global__ void KeMatrixCollectSharedBias(real* B, + real* A, const int channel, const int M, const int N, @@ -589,7 +541,7 @@ __global__ void KeMatrixCollectSharedBias(real *B, int n = j * blockSize + tid; int m = n / dim; int w = n % dim; - smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0; + smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0; __syncthreads(); simpleReduce(smem, tid, blockSize); sum += smem[0]; @@ -611,33 +563,32 @@ void hl_matrix_collect_shared_bias(real* B_d, const int limit = 64; int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel; - KeMatrixCollectSharedBias - <<< grids, blocks, 0, STREAM_DEFAULT>>> - (B_d, A_d, channel, dimM, dimN, dim, limit, scale); + KeMatrixCollectSharedBias<<>>( + B_d, A_d, channel, dimM, dimN, dim, limit, scale); CHECK_SYNC("hl_matrix_collect_shared_bias failed"); } -__global__ void keMatrixRotate(real* mat, real* matRot, - int dimM, int dimN, bool clockWise) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < dimM * dimN) { - int i = idx / dimN; - int j = idx % dimN; - if (clockWise) { - matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j]; - } else { - matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)]; - } +__global__ void keMatrixRotate( + real* mat, real* matRot, int dimM, int dimN, bool clockWise) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < dimM * dimN) { + int i = idx / dimN; + int j = idx % dimN; + if (clockWise) { + matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j]; + } else { + matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)]; } + } } -void hl_matrix_rotate(real *mat, real* matRot, - int dimM, int dimN, bool clockWise) { - CHECK_NOTNULL(mat); - CHECK_NOTNULL(matRot); - const int threads = 512; - const int blocks = DIVUP(dimM * dimN, threads); - keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>> - (mat, matRot, dimM, dimN, clockWise); - CHECK_SYNC("hl_matrix_rotate failed"); +void hl_matrix_rotate( + real* mat, real* matRot, int dimM, int dimN, bool clockWise) { + CHECK_NOTNULL(mat); + CHECK_NOTNULL(matRot); + const int threads = 512; + const int blocks = DIVUP(dimM * dimN, threads); + keMatrixRotate<<>>( + mat, matRot, dimM, dimN, clockWise); + CHECK_SYNC("hl_matrix_rotate failed"); } diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index eeee921db54e20ea6a017d2b83f2d7ca9e5e037e..c52780dfcaff6e5b94d3568fac4ca011b76a1442 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -16,36 +16,36 @@ limitations under the License. 
*/ #include "hl_device_functions.cuh" #include "paddle/utils/Logging.h" -__global__ void KeMaxSequenceForward(real *input, - const int *sequence, +__global__ void KeMaxSequenceForward(real* input, + const int* sequence, real* output, - int *index, + int* index, int numSequences, int dim) { int dimIdx = threadIdx.x; int sequenceId = blockIdx.x; if (sequenceId >= numSequences) return; int start = sequence[sequenceId]; - int end = sequence[sequenceId+1]; + int end = sequence[sequenceId + 1]; for (int i = dimIdx; i < dim; i += blockDim.x) { real tmp = -HL_FLOAT_MAX; int tmpId = -1; for (int insId = start; insId < end; insId++) { - if (tmp < input[insId*dim + i]) { - tmp = input[insId*dim + i]; + if (tmp < input[insId * dim + i]) { + tmp = input[insId * dim + i]; tmpId = insId; } } - output[sequenceId*dim + i] = tmp; - index[sequenceId*dim + i] = tmpId; + output[sequenceId * dim + i] = tmp; + index[sequenceId * dim + i] = tmpId; } } void hl_max_sequence_forward(real* input, const int* sequence, real* output, - int *index, + int* index, int numSequences, int dim) { CHECK_NOTNULL(input); @@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input, dim3 threads(256, 1); dim3 grid(numSequences, 1); - KeMaxSequenceForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (input, sequence, output, index, numSequences, dim); + KeMaxSequenceForward<<>>( + input, sequence, output, index, numSequences, dim); CHECK_SYNC("hl_max_sequence_forward failed"); } -__global__ void KeMaxSequenceBackward(real *outputGrad, - int *index, - real* inputGrad, - int numSequences, - int dim) { +__global__ void KeMaxSequenceBackward( + real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) { int idx = threadIdx.x + blockIdx.x * blockDim.x; int colIdx = idx % dim; - if (idx < numSequences*dim) { + if (idx < numSequences * dim) { int insId = index[idx]; inputGrad[insId * dim + colIdx] += outputGrad[idx]; } } -void hl_max_sequence_backward(real* outputGrad, - int *index, - real* inputGrad, - int numSequences, - int dim) { +void hl_max_sequence_backward( + real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) { CHECK_NOTNULL(outputGrad); CHECK_NOTNULL(index); CHECK_NOTNULL(inputGrad); @@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad, unsigned int blocks = (numSequences * dim + 128 - 1) / 128; dim3 threads(128, 1); dim3 grid(blocks, 1); - KeMaxSequenceBackward<<< grid, threads, 0, STREAM_DEFAULT >>> - (outputGrad, index, inputGrad, numSequences, dim); + KeMaxSequenceBackward<<>>( + outputGrad, index, inputGrad, numSequences, dim); CHECK_SYNC("hl_max_sequence_backward failed"); } -template +template __global__ void KeMatrixAddRows(real* output, real* table, int* ids, @@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output, while (sampleId < numSamples) { int tableId = ids[sampleId]; if ((0 <= tableId) && (tableId < tableSize)) { - real *outputData = output + sampleId * dim; - real *tableData = table + tableId * dim; + real* outputData = output + sampleId * dim; + real* tableData = table + tableId * dim; for (int i = idx; i < dim; i += blockDimX) { if (AddRow == 0) { outputData[i] += tableData[i]; @@ -114,24 +108,27 @@ __global__ void KeMatrixAddRows(real* output, } } } - sampleId += blockDimY*gridDimX; + sampleId += blockDimY * gridDimX; } } -template -__global__ -void KeSequence2Batch(real *batch, - real *sequence, - const int *batchIndex, - int seqWidth, - int batchCount) { +template +__global__ void KeSequence2Batch(real* batch, + real* sequence, + const int* 
batchIndex, + int seqWidth, + int batchCount) { int idx = threadIdx.x; int idy = threadIdx.y; int id = blockIdx.x + idy * gridDimX; while (id < batchCount) { int seqId = batchIndex[id]; - real* batchData = batch + id*seqWidth; - real* seqData = sequence + seqId*seqWidth; + real* batchData = batch + id * seqWidth; + real* seqData = sequence + seqId * seqWidth; for (int i = idx; i < seqWidth; i += blockDimX) { if (seq2batch) { if (isAdd) { @@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch, } } } - id += blockDimY*gridDimX; + id += blockDimY * gridDimX; } } -void hl_sequence2batch_copy(real *batch, - real *sequence, - const int *batchIndex, +void hl_sequence2batch_copy(real* batch, + real* sequence, + const int* batchIndex, int seqWidth, int batchCount, bool seq2batch) { @@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch, dim3 threads(128, 8); dim3 grid(8, 1); if (seq2batch) { - KeSequence2Batch<128, 8, 8, 1, 0><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 1, 0><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } else { - KeSequence2Batch<128, 8, 8, 0, 0><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 0, 0><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } CHECK_SYNC("hl_sequence2batch_copy failed"); } -void hl_sequence2batch_add(real *batch, - real *sequence, - int *batchIndex, +void hl_sequence2batch_add(real* batch, + real* sequence, + int* batchIndex, int seqWidth, int batchCount, bool seq2batch) { @@ -186,23 +183,22 @@ void hl_sequence2batch_add(real *batch, dim3 threads(128, 8); dim3 grid(8, 1); if (seq2batch) { - KeSequence2Batch<128, 8, 8, 1, 1><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 1, 1><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } else { - KeSequence2Batch<128, 8, 8, 0, 1><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 0, 1><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } CHECK_SYNC("hl_sequence2batch_add failed"); } -template -__global__ -void KeSequence2BatchPadding(real* batch, - real* sequence, - const int* sequenceStartPositions, - const size_t sequenceWidth, - const size_t maxSequenceLength, - const size_t numSequences) { +template +__global__ void KeSequence2BatchPadding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences) { int batchIdx = blockIdx.y; int sequenceStart = sequenceStartPositions[batchIdx]; int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart; @@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch, if (seq2batch) { /* sequence -> batch */ if (normByTimes) { - KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<1, 1><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); } else { - KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<0, 1><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + 
maxSequenceLength, + numSequences); } } else { /* batch -> sequence */ if (normByTimes) { - KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<1, 0><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); } else { - KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<0, 0><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); } } CHECK_SYNC("hl_sequence2batch_copy_padding failed"); } -__device__ inline float my_rsqrt(float x) { - return rsqrtf(x); -} +__device__ inline float my_rsqrt(float x) { return rsqrtf(x); } -__device__ inline double my_rsqrt(double x) { - return rsqrt(x); -} +__device__ inline double my_rsqrt(double x) { return rsqrt(x); } __global__ void KeSequenceAvgForward(real* dst, real* src, @@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst, for (int i = start; i < end; i++) { sum += src[i * width + col]; } - sum = mode == 1 ? sum : - (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength)); + sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength + : sum * my_rsqrt((real)seqLength)); dst[gid] += sum; } } @@ -347,10 +355,10 @@ void hl_sequence_avg_forward(real* dst, int grid = DIVUP(width * height, 512); CHECK(mode == 0 || mode == 1 || mode == 2) - << "mode error in hl_sequence_avg_forward!"; + << "mode error in hl_sequence_avg_forward!"; - KeSequenceAvgForward<<< grid, block, 0, STREAM_DEFAULT >>> - (dst, src, starts, height, width, mode); + KeSequenceAvgForward<<>>( + dst, src, starts, height, width, mode); CHECK_SYNC("hl_sequence_avg_forward failed"); } @@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst, int seqLength = end - start; if (seqLength == 0) return; real grad = src[gid]; - grad = mode == 1 ? grad : - (mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength)); + grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength + : grad * my_rsqrt((real)seqLength)); for (int i = start; i < end; i++) { dst[i * width + col] += grad; } @@ -392,9 +400,9 @@ void hl_sequence_avg_backward(real* dst, int grid = DIVUP(width * height, 512); CHECK(mode == 0 || mode == 1 || mode == 2) - << "mode error in hl_sequence_avg_backward!"; + << "mode error in hl_sequence_avg_backward!"; - KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>> - (dst, src, starts, height, width, mode); + KeSequenceAvgBackward<<>>( + dst, src, starts, height, width, mode); CHECK_SYNC("hl_sequence_avg_backward failed"); } diff --git a/paddle/cuda/src/hl_cuda_sparse.cu b/paddle/cuda/src/hl_cuda_sparse.cu index ab9ab57c884137f117c25c2752b5603b2e8b7135..6351e7e01ee55b6303a6e48bc9ebf9834a83130e 100644 --- a/paddle/cuda/src/hl_cuda_sparse.cu +++ b/paddle/cuda/src/hl_cuda_sparse.cu @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "hl_cuda.h" +#include "hl_cuda_sparse.cuh" +#include "hl_matrix_apply.cuh" +#include "hl_matrix_ops.cuh" #include "hl_sparse.h" #include "hl_sparse.ph" -#include "hl_matrix_ops.cuh" -#include "hl_matrix_apply.cuh" -#include "hl_cuda_sparse.cuh" #include "paddle/utils/Logging.h" DEFINE_MATRIX_UNARY_PARAMETER_OP(mul_scalar, ONE_PARAMETER, a = a * p); @@ -34,15 +33,15 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d, CHECK(A_d->format == HL_SPARSE_CSR) << "matrix format error!"; if (A_d->nnz == 0) { - hl_gpu_apply_unary_op( - unary::Zero(), C_d, dimM, dimN, dimN); + hl_gpu_apply_unary_op(unary::Zero(), C_d, dimM, dimN, dimN); return; } /* nnz != 0 */ hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); - CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && - A_d2->csr_row && A_d2->csr_col) << "parameter transa error!"; + CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && A_d2->csr_row && + A_d2->csr_col) + << "parameter transa error!"; int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; @@ -50,21 +49,11 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d, dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsr2Dense<0> - <<>>(A_d2->csr_val, - A_d2->csr_row, - A_d2->csr_col, - C_d, - dimM, - dimN); + KeSMatrixCsr2Dense<0><<>>( + A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN); } else if (A_d->type == HL_FLOAT_VALUE) { - KeSMatrixCsr2Dense<1> - <<>>(A_d2->csr_val, - A_d2->csr_row, - A_d2->csr_col, - C_d, - dimM, - dimN); + KeSMatrixCsr2Dense<1><<>>( + A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN); } else { } CHECK_SYNC("hl_matrix_csr2dense failed"); @@ -80,15 +69,15 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, CHECK(A_d->format == HL_SPARSE_CSC) << "matrix format error!"; if (A_d->nnz == 0) { - hl_gpu_apply_unary_op( - unary::Zero(), C_d, dimM, dimN, dimN); + hl_gpu_apply_unary_op(unary::Zero(), C_d, dimM, dimN, dimN); return; } /* nnz != 0 */ hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix); - CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && - A_d2->csc_row && A_d2->csc_col) << "parameter transa error!"; + CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && A_d2->csc_row && + A_d2->csc_col) + << "parameter transa error!"; int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; @@ -96,21 +85,11 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsc2Dense<0> - <<>>(A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - C_d, - dimM, - dimN); + KeSMatrixCsc2Dense<0><<>>( + A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN); } else if (A_d->type == HL_FLOAT_VALUE) { - KeSMatrixCsc2Dense<1> - <<>>(A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - C_d, - dimM, - dimN); + KeSMatrixCsc2Dense<1><<>>( + A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN); } else { } CHECK_SYNC("hl_matrix_csc2dense failed"); @@ -118,43 +97,43 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) { CHECK_NOTNULL(A_d); CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; CHECK(value_type == HL_FLOAT_VALUE || value_type == 
HL_NO_VALUE) - << "sparse matrix value type error!"; + << "sparse matrix value type error!"; /* avoid malloc 0 bytes */ int nnz_s = (nnz == 0 ? 1 : nnz); if (format == HL_SPARSE_CSR) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csr_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); CHECK_NOTNULL(tmp); - hl_csr_matrix csr = (hl_csr_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); csr->sparsity = -1.0; if (value_type == HL_NO_VALUE) { csr->csr_val = NULL; csr->nnz_s = nnz_s; - csr->row_s = dimM+1; - csr->csr_row = (int*)hl_malloc_device((dimM+1)*sizeof(int)); - csr->csr_col = (int*)hl_malloc_device((nnz_s)*sizeof(int)); + csr->row_s = dimM + 1; + csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int)); + csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; } else if (value_type == HL_FLOAT_VALUE) { csr->nnz_s = nnz_s; - csr->row_s = dimM+1; - csr->csr_val = (real*)hl_malloc_device((nnz_s)*sizeof(real)); - csr->csr_row = (int*)hl_malloc_device((dimM+1)*sizeof(int)); - csr->csr_col = (int*)hl_malloc_device((nnz_s)*sizeof(int)); + csr->row_s = dimM + 1; + csr->csr_val = (real *)hl_malloc_device((nnz_s) * sizeof(real)); + csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int)); + csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; @@ -162,28 +141,28 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, } else if (format == HL_SPARSE_CSC) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csc_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); CHECK_NOTNULL(tmp); - hl_csc_matrix csc = (hl_csc_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); csc->sparsity = -1.0f; if (value_type == HL_NO_VALUE) { csc->csc_val = NULL; csc->nnz_s = nnz_s; - csc->col_s = dimN+1; - csc->csc_row = (int*)hl_malloc_device((nnz_s)*sizeof(int)); - csc->csc_col = (int*)hl_malloc_device((dimN+1)*sizeof(int)); + csc->col_s = dimN + 1; + csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int)); + csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csc; } else if (value_type == HL_FLOAT_VALUE) { csc->nnz_s = nnz_s; - csc->col_s = dimN+1; - csc->csc_val = (real*)hl_malloc_device((nnz_s)*sizeof(real)); - csc->csc_row = (int*)hl_malloc_device((nnz_s)*sizeof(int)); - csc->csc_col = (int*)hl_malloc_device((dimN+1)*sizeof(int)); + csc->col_s = dimN + 1; + csc->csc_val = (real *)hl_malloc_device((nnz_s) * sizeof(real)); + csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int)); + csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csc; @@ -200,7 +179,7 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) { CHECK_NOTNULL(A_d); CHECK(A_d->format == HL_SPARSE_CSR || A_d->format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; if (A_d->matrix == NULL) { free(A_d); @@ -249,77 +228,77 @@ void 
hl_free_sparse_matrix(hl_sparse_matrix_s A_d) { } void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - void * dest_d, + void *dest_d, size_t size, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) { CHECK_NOTNULL(A_d); CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; if (format == HL_SPARSE_CSR) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - size_t size_ = (dimM+1)*sizeof(int) + nnz*sizeof(int); + size_t size_ = (dimM + 1) * sizeof(int) + nnz * sizeof(int); if (value_type != HL_NO_VALUE) { - size_ += nnz*sizeof(real); + size_ += nnz * sizeof(real); } CHECK_LE(size_, size) << "dest_d size(" << size - << ") too small, should bigger than(" << size_ << ")!"; + << ") too small, should bigger than(" << size_ + << ")!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csr_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); CHECK_NOTNULL(tmp); - hl_csr_matrix csr = (hl_csr_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); if (value_type == HL_NO_VALUE) { csr->csr_val = NULL; - csr->csr_row = (int*)dest_d; - csr->csr_col = (int*)((char*)dest_d + (dimM+1)*sizeof(int)); + csr->csr_row = (int *)dest_d; + csr->csr_col = (int *)((char *)dest_d + (dimM + 1) * sizeof(int)); } else { - csr->csr_val = (real*)dest_d; - csr->csr_row = (int*)((char*)dest_d + nnz*sizeof(real)); - csr->csr_col = (int*)((char*)dest_d + - nnz*sizeof(real) + - (dimM+1)*sizeof(int)); + csr->csr_val = (real *)dest_d; + csr->csr_row = (int *)((char *)dest_d + nnz * sizeof(real)); + csr->csr_col = (int *)((char *)dest_d + nnz * sizeof(real) + + (dimM + 1) * sizeof(int)); } csr->nnz_s = nnz; - csr->row_s = dimM+1; + csr->row_s = dimM + 1; csr->sparsity = -1.0; *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; } else if (format == HL_SPARSE_CSC) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - size_t size_ = (dimN+1)*sizeof(int) + nnz*sizeof(int); + size_t size_ = (dimN + 1) * sizeof(int) + nnz * sizeof(int); if (value_type != HL_NO_VALUE) { - size_ += nnz*sizeof(real); + size_ += nnz * sizeof(real); } CHECK_LE(size_, size) << "dest_d size(" << size - << ") too small, should bigger than(" << size_ << ")!"; + << ") too small, should bigger than(" << size_ + << ")!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csc_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); CHECK_NOTNULL(tmp); - hl_csc_matrix csc = (hl_csc_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); if (value_type == HL_NO_VALUE) { csc->csc_val = NULL; - csc->csc_col = (int*)dest_d; - csc->csc_row = (int*)((char*)dest_d + (dimN+1)*sizeof(int)); + csc->csc_col = (int *)dest_d; + csc->csc_row = (int *)((char *)dest_d + (dimN + 1) * sizeof(int)); } else { - csc->csc_val = (real*)dest_d; - csc->csc_col = (int*)((char*)dest_d + nnz*sizeof(real)); - csc->csc_row = (int*)((char*)dest_d + - nnz*sizeof(real) + - (dimN+1)*sizeof(int)); + csc->csc_val = (real *)dest_d; + csc->csc_col = (int *)((char *)dest_d + nnz * sizeof(real)); + csc->csc_row = (int *)((char *)dest_d + nnz * sizeof(real) + + (dimN + 1) * sizeof(int)); } csc->nnz_s = nnz; - csc->col_s = dimN+1; + csc->col_s = dimN + 1; csc->sparsity = -1.0f; *A_d = 
(hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csc; @@ -333,11 +312,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, } void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - real* value_d, - int* rows_d, - int* cols_d, + real *value_d, + int *rows_d, + int *cols_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) { @@ -345,11 +324,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; if (format == HL_SPARSE_CSR) { - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csr_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); CHECK_NOTNULL(tmp); hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); @@ -362,8 +341,8 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; } else if (format == HL_SPARSE_CSC) { - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csc_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); CHECK_NOTNULL(tmp); hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); @@ -396,35 +375,30 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix, hl_stream_t stream) { CHECK_NOTNULL(csr_matrix); CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR) - << "csr_matrix is not csr format!"; + << "csr_matrix is not csr format!"; CHECK_NOTNULL(csr_matrix->matrix); hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix); - CHECK_LE(csr_matrix->nnz, csr->nnz_s) - << "copy size " << csr_matrix->nnz - << " is big than alloc size " << csr->nnz_s; + CHECK_LE(csr_matrix->nnz, csr->nnz_s) << "copy size " << csr_matrix->nnz + << " is big than alloc size " + << csr->nnz_s; - CHECK_LE((csr_matrix->rows+1), csr->row_s) - << "copy size " << (csr_matrix->rows + 1) - << " is big than alloc size " << csr->row_s; + CHECK_LE((csr_matrix->rows + 1), csr->row_s) + << "copy size " << (csr_matrix->rows + 1) << " is big than alloc size " + << csr->row_s; - CHECK(csr_matrix->type == HL_FLOAT_VALUE || - csr_matrix->type == HL_NO_VALUE) - << "sparse matrix value type error!"; + CHECK(csr_matrix->type == HL_FLOAT_VALUE || csr_matrix->type == HL_NO_VALUE) + << "sparse matrix value type error!"; if (csr_matrix->type == HL_NO_VALUE) { if (csr_row == NULL && csr_col == NULL) { return; } else if (csr_row != NULL && csr_col != NULL) { - hl_memcpy_async(csr->csr_row, - csr_row, - (csr_matrix->rows+1)*sizeof(int), - stream); + hl_memcpy_async( + csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream); - hl_memcpy_async(csr->csr_col, - csr_col, - (csr_matrix->nnz)*sizeof(int), - stream); + hl_memcpy_async( + csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream); } else { LOG(FATAL) << "parameter csr_row or csr_col is null pointer!"; } @@ -432,30 +406,21 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix, if (csr_val == NULL && csr_row == NULL && csr_col == NULL) { return; } else if (csr_val != NULL && csr_row == NULL && csr_col == NULL) { - hl_memcpy_async(csr->csr_val, - csr_val, - (csr_matrix->nnz)*sizeof(real), - stream); + hl_memcpy_async( + csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream); } else if (csr_val != NULL && csr_row != NULL && csr_col != NULL) { - 
hl_memcpy_async(csr->csr_val, - csr_val, - (csr_matrix->nnz)*sizeof(real), - stream); - hl_memcpy_async(csr->csr_row, - csr_row, - (csr_matrix->rows+1)*sizeof(int), - stream); - hl_memcpy_async(csr->csr_col, - csr_col, - (csr_matrix->nnz)*sizeof(int), - stream); + hl_memcpy_async( + csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream); + hl_memcpy_async( + csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream); + hl_memcpy_async( + csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream); } else { LOG(FATAL) << "parameter csr_row or csr_col is null pointer!"; } } - csr->sparsity = ((float)csr_matrix->nnz) / - ((float)csr_matrix->rows) / + csr->sparsity = ((float)csr_matrix->nnz) / ((float)csr_matrix->rows) / ((float)csr_matrix->cols); } @@ -466,33 +431,28 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix, hl_stream_t stream) { CHECK_NOTNULL(csc_matrix); CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC) - << "csc_matrix is not csc format error!"; + << "csc_matrix is not csc format error!"; hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix); - CHECK_LE(csc_matrix->nnz, csc->nnz_s) - << "copy size " << csc_matrix->nnz - << " is big than alloc size " << csc->nnz_s; + CHECK_LE(csc_matrix->nnz, csc->nnz_s) << "copy size " << csc_matrix->nnz + << " is big than alloc size " + << csc->nnz_s; - CHECK_LE((csc_matrix->cols+1), csc->col_s) - << "copy size " <<(csc_matrix->cols + 1) - << " is big than alloc size " << csc->col_s; + CHECK_LE((csc_matrix->cols + 1), csc->col_s) + << "copy size " << (csc_matrix->cols + 1) << " is big than alloc size " + << csc->col_s; - CHECK(csc_matrix->type == HL_FLOAT_VALUE || - csc_matrix->type == HL_NO_VALUE) - << "sparse matrix value type error!"; + CHECK(csc_matrix->type == HL_FLOAT_VALUE || csc_matrix->type == HL_NO_VALUE) + << "sparse matrix value type error!"; if (csc_matrix->type == HL_NO_VALUE) { if (csc_row == NULL && csc_col == NULL) { return; } else if (csc_row != NULL && csc_col != NULL) { - hl_memcpy_async(csc->csc_row, - csc_row, - (csc_matrix->nnz)*sizeof(int), - stream); - hl_memcpy_async(csc->csc_col, - csc_col, - (csc_matrix->cols+1)*sizeof(int), - stream); + hl_memcpy_async( + csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream); + hl_memcpy_async( + csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); } else { LOG(FATAL) << "parameter csc_row or csc_col is null pointer!"; } @@ -500,30 +460,21 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix, if (csc_val == NULL && csc_row == NULL && csc_col == NULL) { return; } else if (csc_val != NULL && csc_row == NULL && csc_col == NULL) { - hl_memcpy_async(csc->csc_val, - csc_val, - (csc_matrix->nnz)*sizeof(real), - stream); + hl_memcpy_async( + csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream); } else if (csc_val != NULL && csc_row != NULL && csc_col != NULL) { - hl_memcpy_async(csc->csc_val, - csc_val, - (csc_matrix->nnz)*sizeof(real), - stream); - hl_memcpy_async(csc->csc_row, - csc_row, - (csc_matrix->nnz)*sizeof(int), - stream); - hl_memcpy_async(csc->csc_col, - csc_col, - (csc_matrix->cols+1)*sizeof(int), - stream); + hl_memcpy_async( + csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream); + hl_memcpy_async( + csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream); + hl_memcpy_async( + csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); } else { LOG(FATAL) << "parameter csc_row or csc_col is null pointer!"; } } - csc->sparsity = ((float)csc_matrix->nnz) / - 
((float)csc_matrix->rows) / + csc->sparsity = ((float)csc_matrix->nnz) / ((float)csc_matrix->rows) / ((float)csc_matrix->cols); } @@ -531,32 +482,23 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst, hl_sparse_matrix_s src, hl_stream_t stream) { CHECK(dst && src && dst->matrix && src->matrix) - << "parameter dst or src is null pointer!"; - CHECK_EQ(dst->format, src->format) - << "sparse matrix format does not match!"; + << "parameter dst or src is null pointer!"; + CHECK_EQ(dst->format, src->format) << "sparse matrix format does not match!"; CHECK(dst->type != HL_FLOAT_VALUE || src->type != HL_NO_VALUE) - << "src sparse matrix is no value, dst sparse matrix has value!"; + << "src sparse matrix is no value, dst sparse matrix has value!"; if (dst->format == HL_SPARSE_CSR) { dst->rows = src->rows; dst->cols = src->cols; - dst->nnz = src->nnz; + dst->nnz = src->nnz; hl_csr_matrix csr = (hl_csr_matrix)src->matrix; - hl_memcpy_csr_matrix(dst, - csr->csr_val, - csr->csr_row, - csr->csr_col, - stream); + hl_memcpy_csr_matrix(dst, csr->csr_val, csr->csr_row, csr->csr_col, stream); } else if (dst->format == HL_SPARSE_CSC) { dst->rows = src->rows; dst->cols = src->cols; - dst->nnz = src->nnz; + dst->nnz = src->nnz; hl_csc_matrix csc = (hl_csc_matrix)src->matrix; - hl_memcpy_csc_matrix(dst, - csc->csc_val, - csc->csc_row, - csc->csc_col, - stream); + hl_memcpy_csc_matrix(dst, csc->csc_val, csc->csc_row, csc->csc_col, stream); } else { LOG(FATAL) << "sparse matrix format error!"; } @@ -569,20 +511,24 @@ static void _beta_mul_c(real *c, int dimM, int dimN, real beta) { if (beta == 0.0) { hl_gpu_apply_unary_op(unary::Zero(), c, dimM, dimN, dimN); } else { - if (beta != 1.0){ - hl_gpu_apply_unary_op( - unary::mul_scalar(beta), c, dimM, dimN, dimN); + if (beta != 1.0) { + hl_gpu_apply_unary_op(unary::mul_scalar(beta), c, dimM, dimN, dimN); } } return; } -void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transb, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -592,7 +538,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) || (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) { - LOG(FATAL) << "parameter error!"; + LOG(FATAL) << "parameter error!"; } if (A_d->nnz == 0) { @@ -603,8 +549,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, /* nnz != 0 */ hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) || - A_d2->csr_row == NULL || - A_d2->csr_col == NULL) { + A_d2->csr_row == NULL || A_d2->csr_col == NULL) { LOG(FATAL) << "parameter error!"; } @@ -617,63 +562,63 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, /* sparsity pattern */ // A_d->sparsity; if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsrMulDense<0> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<0><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCsrMulDense<1> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - 
dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<1><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (HPPL_OP_T == transa) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / - CU_CSC_MUL_DENSE_BLOCK_N; - int blocksY = (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / - CU_CSC_MUL_DENSE_BLOCK_K; + int blocksX = + (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N; + int blocksY = + (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K; dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y); dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCscMulDense<0> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<0><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCscMulDense<1> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<1><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transa error!"; @@ -682,11 +627,16 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_csr_mul_dense failed"); } -void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, - hl_sparse_matrix_s B_d, hl_trans_op_t transb, +void hl_matrix_dense_mul_csc(real *A_d, + hl_trans_op_t transa, + hl_sparse_matrix_s B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transa, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -698,8 +648,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, LOG(FATAL) << "parameter dims error!"; } - CHECK_EQ(B_d->format, HL_SPARSE_CSC) - << "matrix format error!"; + CHECK_EQ(B_d->format, HL_SPARSE_CSC) << "matrix format error!"; if (B_d->nnz == 0) { _beta_mul_c(C_d, dimM, dimN, beta); @@ -709,8 +658,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, /* nnz != 0 */ hl_csc_matrix B_d2 = (hl_csc_matrix)(B_d->matrix); if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) || - B_d2->csc_row == NULL || - B_d2->csc_col == NULL) { + B_d2->csc_row == NULL || B_d2->csc_col == NULL) { LOG(FATAL) << "parameter B is null!"; } @@ -721,60 +669,60 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsc<0> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_row, - B_d2->csc_col, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<0><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_row, + B_d2->csc_col, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsc<1> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_row, - B_d2->csc_col, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<1><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_row, + B_d2->csc_col, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (transb == HPPL_OP_T) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X; - int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M; + int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X; + int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M; dim3 
threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y); dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsr<0> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_col, - B_d2->csc_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsr<0><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_col, + B_d2->csc_row, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsr<1> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_col, - B_d2->csc_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsr<1><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_col, + B_d2->csc_row, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transb error!"; @@ -783,24 +731,28 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_dense_mul_csc failed"); } -void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, - hl_sparse_matrix_s B_d, hl_trans_op_t transb, +void hl_matrix_dense_mul_csr(real *A_d, + hl_trans_op_t transa, + hl_sparse_matrix_s B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transa, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); - if (dimM <= 0 || dimN <= 0 || dimK <= 0 - || (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) - || (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) { + if (dimM <= 0 || dimN <= 0 || dimK <= 0 || + (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) || + (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) { LOG(FATAL) << "parameter dims error!"; } - CHECK_EQ(B_d->format, HL_SPARSE_CSR) - << "matrix format error!"; + CHECK_EQ(B_d->format, HL_SPARSE_CSR) << "matrix format error!"; if (B_d->nnz == 0) { _beta_mul_c(C_d, dimM, dimN, beta); @@ -810,41 +762,40 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, /* nnz != 0 */ hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix); if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) || - B_d2->csr_row == NULL || - B_d2->csr_col == NULL) { + B_d2->csr_row == NULL || B_d2->csr_col == NULL) { LOG(FATAL) << "parameter transa error!"; } if (transb == HPPL_OP_N) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X; - int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M; + int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X; + int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M; dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y); dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsr<0> - <<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_row, - B_d2->csr_col, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsr<0><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_row, + B_d2->csr_col, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsr<1> - <<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_row, - B_d2->csr_col, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsr<1><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_row, + B_d2->csr_col, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (transb == HPPL_OP_T) { int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST; @@ -852,29 +803,29 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST); dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsc<0> - 
<<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_col, - B_d2->csr_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<0><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_col, + B_d2->csr_row, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsc<1> - <<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_col, - B_d2->csr_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<1><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_col, + B_d2->csr_row, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transb error!"; @@ -883,11 +834,16 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_dense_mul_csr failed"); } -void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transb, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -908,42 +864,43 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, /* nnz != 0 */ hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix); if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) || - A_d2->csc_row == NULL || - A_d2->csc_col == NULL) { + A_d2->csc_row == NULL || A_d2->csc_col == NULL) { LOG(FATAL) << "parameter error!"; } if (HPPL_OP_N == transa) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N -1)/CU_CSC_MUL_DENSE_BLOCK_N; - int blocksY = (dimK + CU_CSC_MUL_DENSE_BLOCK_K -1)/CU_CSC_MUL_DENSE_BLOCK_K; + int blocksX = + (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N; + int blocksY = + (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K; dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y); dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCscMulDense<0> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<0><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCscMulDense<1> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<1><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (HPPL_OP_T == transa) { int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N; @@ -954,29 +911,29 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, /* sparsity pattern */ // A_d->sparsity; if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsrMulDense<0> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<0><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCsrMulDense<1> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<1><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transa error!"; @@ -985,11 +942,16 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, 
hl_trans_op_t transa, CHECK_SYNC("hl_matrix_csc_mul_dense failed"); } -void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, - hl_sparse_matrix_s C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { +void hl_sparse_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, + hl_sparse_matrix_s C_d, + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); @@ -1000,18 +962,14 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, if (C_d->format == HL_SPARSE_CSC) { hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix); - if (C_d2->csc_val == NULL || - C_d2->csc_row == NULL || + if (C_d2->csc_val == NULL || C_d2->csc_row == NULL || C_d2->csc_col == NULL) { LOG(FATAL) << "parameter error!"; } if (beta != 1.0) { - hl_gpu_apply_unary_op(unary::mul_scalar(beta), - C_d2->csc_val, - 1, - C_d->nnz, - C_d->nnz); + hl_gpu_apply_unary_op( + unary::mul_scalar(beta), C_d2->csc_val, 1, C_d->nnz, C_d->nnz); } int blocksX = dimN; @@ -1020,34 +978,30 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, dim3 grid(blocksX, blocksY); bool transA = transa == HPPL_OP_T ? 1 : 0; bool transB = transb == HPPL_OP_T ? 1 : 0; - KeSMatrixDenseMulDense2CSC - <<>>(C_d2->csc_val, - C_d2->csc_row, - C_d2->csc_col, - A_d, - B_d, - transA, - transB, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulDense2CSC<<>>( + C_d2->csc_val, + C_d2->csc_row, + C_d2->csc_col, + A_d, + B_d, + transA, + transB, + dimM, + dimN, + dimK, + alpha, + beta); CHECK_SYNC("hl_sparse_matrix_mul failed"); } else { hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix); if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) || - C_d2->csr_row == NULL || - C_d2->csr_col == NULL) { + C_d2->csr_row == NULL || C_d2->csr_col == NULL) { LOG(FATAL) << "parameter error!"; } if (beta != 1.0) { - hl_gpu_apply_unary_op(unary::mul_scalar(beta), - C_d2->csr_val, - 1, - C_d->nnz, - C_d->nnz); + hl_gpu_apply_unary_op( + unary::mul_scalar(beta), C_d2->csr_val, 1, C_d->nnz, C_d->nnz); } bool transA = transa == HPPL_OP_T ? 1 : 0; @@ -1058,20 +1012,20 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1); dim3 grid(blocksX, blocksY); - KeSMatrixDenseMulDense2CSR - <<>>(C_d2->csr_val, - C_d2->csr_row, - C_d2->csr_col, - A_d, - B_d, - transA, - transB, - dimM, - dimN, - dimK, - alpha, - beta); - CHECK_SYNC("hl_sparse_matrix_mul failed"); + KeSMatrixDenseMulDense2CSR<<>>( + C_d2->csr_val, + C_d2->csr_row, + C_d2->csr_col, + A_d, + B_d, + transA, + transB, + dimM, + dimN, + dimK, + alpha, + beta); + CHECK_SYNC("hl_sparse_matrix_mul failed"); } else { CHECK(!transA) << "Not supported A is trans and B is not trans!"; @@ -1080,21 +1034,21 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, avgNnzPerRow = avgNnzPerRow > 0 ? 
avgNnzPerRow : 1; int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE); dim3 grid(gridx, dimM); - KeSMatrixDenseMulDenseTrans2CSR - <<>>(C_d2->csr_val, - C_d2->csr_row, - C_d2->csr_col, - A_d, - B_d, - transA, - transB, - dimM, - dimN, - dimK, - alpha, - beta); - CHECK_SYNC("hl_sparse_matrix_mul failed"); - } + KeSMatrixDenseMulDenseTrans2CSR<<>>( + C_d2->csr_val, + C_d2->csr_row, + C_d2->csr_col, + A_d, + B_d, + transA, + transB, + dimM, + dimN, + dimK, + alpha, + beta); + CHECK_SYNC("hl_sparse_matrix_mul failed"); + } } } @@ -1111,7 +1065,7 @@ void hl_memcpy_from_csc_matrix(real *csc_val, CHECK_NOTNULL(csc_col); CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC) - << "csc_matrix is not csc format error!"; + << "csc_matrix is not csc format error!"; if (csc_matrix->nnz > row_size || csc_matrix->cols + 1 > static_cast(col_size)) { @@ -1119,20 +1073,20 @@ void hl_memcpy_from_csc_matrix(real *csc_val, } hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix); - hl_memcpy_async((void*)csc_row, - (void*)csc->csc_row, + hl_memcpy_async((void *)csc_row, + (void *)csc->csc_row, (csc_matrix->nnz) * sizeof(int), stream); - hl_memcpy_async((void*)csc_col, - (void*)csc->csc_col, + hl_memcpy_async((void *)csc_col, + (void *)csc->csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); if (csc_matrix->type == HL_FLOAT_VALUE) { if (csc_val != NULL) { CHECK_LE(csc_matrix->nnz, val_size) << "size not match!"; - hl_memcpy_async((void*)csc_val, - (void*)csc->csc_val, - (csc_matrix->nnz)*sizeof(real), + hl_memcpy_async((void *)csc_val, + (void *)csc->csc_val, + (csc_matrix->nnz) * sizeof(real), stream); } else { LOG(FATAL) << "parameter csr_val is null pointer!"; @@ -1152,7 +1106,7 @@ void hl_memcpy_from_csr_matrix(real *csr_val, CHECK_NOTNULL(csr_row); CHECK_NOTNULL(csr_col); CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR) - << "csr_matrix is not csr format error!"; + << "csr_matrix is not csr format error!"; if (csr_matrix->nnz > col_size || csr_matrix->rows + 1 > static_cast(row_size)) { @@ -1160,20 +1114,20 @@ void hl_memcpy_from_csr_matrix(real *csr_val, } hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix); - hl_memcpy_async((void*)csr_row, - (void*)csr->csr_row, - (csr_matrix->rows+1)*sizeof(int), + hl_memcpy_async((void *)csr_row, + (void *)csr->csr_row, + (csr_matrix->rows + 1) * sizeof(int), stream); - hl_memcpy_async((void*)csr_col, - (void*)csr->csr_col, - (csr_matrix->nnz)*sizeof(int), + hl_memcpy_async((void *)csr_col, + (void *)csr->csr_col, + (csr_matrix->nnz) * sizeof(int), stream); if (csr_matrix->type == HL_FLOAT_VALUE) { if (csr_val != NULL) { CHECK_LE(csr_matrix->nnz, val_size) << "size not match!"; - hl_memcpy_async((void*)csr_val, - (void*)csr->csr_val, - (csr_matrix->nnz)*sizeof(real), + hl_memcpy_async((void *)csr_val, + (void *)csr->csr_val, + (csr_matrix->nnz) * sizeof(real), stream); } else { LOG(FATAL) << "parameter csr_val is null pointer!"; @@ -1181,8 +1135,8 @@ void hl_memcpy_from_csr_matrix(real *csr_val, } } -void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM, - int dimN, real scale) { +void hl_sparse_matrix_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) { if (B_d->format == HL_SPARSE_CSR) { hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale); } else { @@ -1190,8 +1144,8 @@ void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM, } } -void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d, - int dimM, int dimN, real scale) { +void hl_matrix_csr_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int 
dimM, int dimN, real scale) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -1216,8 +1170,7 @@ void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d, CHECK_SYNC("hl_matrix_csr_column_sum failed"); } -void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, - real* B_d, real scale) { +void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) { if (A_d->format == HL_SPARSE_CSR) { hl_matrix_csr_add_bias(A_d, B_d, scale); } else { @@ -1225,8 +1178,7 @@ void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, } } -void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d, - real scale) { +void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -1247,8 +1199,12 @@ void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d, CHECK_SYNC("hl_sparse_matrix_add_bias failed"); } -void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM, - int dimN, real alpha, real beta) { +void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, + real *B_d, + int dimM, + int dimN, + real alpha, + real beta) { if (A_d->format == HL_SPARSE_CSR) { hl_matrix_csr_add_dense(A_d, B_d, dimM, dimN, alpha, beta); } else { @@ -1256,8 +1212,12 @@ void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM, } } -void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM, - int dimN, real alpha, real beta) { +void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, + real *B_d, + int dimM, + int dimN, + real alpha, + real beta) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -1277,20 +1237,26 @@ void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM, gridX = gridX > 0 ? gridX : 1; dim3 block(512, 1); dim3 grid(gridX, dimM); - KeSMatrixCsrAddDense<<>>( - A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, B_d, alpha, beta, dimM, dimN); + KeSMatrixCsrAddDense<<>>(A_d2->csr_val, + A_d2->csr_row, + A_d2->csr_col, + B_d, + alpha, + beta, + dimM, + dimN); CHECK_SYNC("hl_sparse_matrix_add_dense failed"); } -int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { +int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { __sparse_get_return__(sMat, row); } -int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { +int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { __sparse_get_return__(sMat, col); } -real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) { +real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) { __sparse_get_return__(sMat, val); } diff --git a/paddle/cuda/src/hl_perturbation_util.cu b/paddle/cuda/src/hl_perturbation_util.cu index 2a945bcdb87fe49c121890128ef77b084ebe8e60..d01a91561efa2ebe8e0cabc2b4e8885f2c02ab48 100644 --- a/paddle/cuda/src/hl_perturbation_util.cu +++ b/paddle/cuda/src/hl_perturbation_util.cu @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#include #include -#include "hl_cuda.h" -#include "hl_time.h" +#include #include "hl_base.h" +#include "hl_cuda.h" #include "hl_perturbation_util.cuh" +#include "hl_time.h" #define _USE_MATH_DEFINES @@ -30,10 +29,16 @@ limitations under the License. */ * centerX, centerY: translation. * sourceX, sourceY: output coordinates in the original image. 
*/ -__device__ void getTranformCoord(int x, int y, real theta, real scale, - real tgtCenter, real imgCenter, - real centerR, real centerC, - int* sourceX, int* sourceY) { +__device__ void getTranformCoord(int x, + int y, + real theta, + real scale, + real tgtCenter, + real imgCenter, + real centerR, + real centerC, + int* sourceX, + int* sourceY) { real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)}; // compute coornidates in the rotated and scaled image @@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale, * created by Wei Xu (genome), converted by Jiang Wang */ -__global__ void kSamplingPatches(const real* imgs, real* targets, - int imgSize, int tgtSize, const int channels, - int samplingRate, const real* thetas, - const real* scales, const int* centerRs, - const int* centerCs, const real padValue, +__global__ void kSamplingPatches(const real* imgs, + real* targets, + int imgSize, + int tgtSize, + const int channels, + int samplingRate, + const real* thetas, + const real* scales, + const int* centerRs, + const int* centerCs, + const real padValue, const int numImages) { const int caseIdx = blockIdx.x * 4 + threadIdx.x; const int pxIdx = blockIdx.y * 128 + threadIdx.y; @@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets, const int pxY = pxIdx / tgtSize; int srcPxX, srcPxY; - getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter, - imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX, + getTranformCoord(pxX, + pxY, + thetas[imgIdx], + scales[imgIdx], + tgtCenter, + imgCenter, + centerCs[caseIdx], + centerRs[caseIdx], + &srcPxX, &srcPxY); imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels; @@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets, * * created by Wei Xu */ -void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, - int*& gpuCenterR, int*& gpuCenterC, - int numImages, int imgSize, real rotateAngle, - real scaleRatio, int samplingRate, +void hl_generate_disturb_params(real*& gpuAngle, + real*& gpuScaleRatio, + int*& gpuCenterR, + int*& gpuCenterC, + int numImages, + int imgSize, + real rotateAngle, + real scaleRatio, + int samplingRate, bool isTrain) { // The number of output samples. 
int numPatches = numImages * samplingRate; @@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, for (int i = 0; i < numImages; i++) { r_angle[i] = (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0) // NOLINT - - 0.5); + - + 0.5); s_ratio[i] = 1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio; // NOLINT } @@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, int pxY = (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0)); // NOLINT - const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]), - sin(-r_angle[i]), cos(-r_angle[i])}; + const real H[4] = {cos(-r_angle[i]), + -sin(-r_angle[i]), + sin(-r_angle[i]), + cos(-r_angle[i])}; real x = pxX - imgCenter; real y = pxY - imgCenter; real xx = H[0] * x + H[1] * y; @@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, delete[] center_c; } -void hl_conv_random_disturb_with_params(const real* images, int imgSize, - int tgtSize, int channels, - int numImages, int samplingRate, +void hl_conv_random_disturb_with_params(const real* images, + int imgSize, + int tgtSize, + int channels, + int numImages, + int samplingRate, const real* gpuRotationAngle, const real* gpuScaleRatio, const int* gpuCenterR, @@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize, dim3 threadsPerBlock(4, 128); dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128)); - kSamplingPatches <<>> - (images, target, imgSize, tgtSize, channels, samplingRate, - gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC, - paddingValue, numImages); + kSamplingPatches<<>>(images, + target, + imgSize, + tgtSize, + channels, + samplingRate, + gpuRotationAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + paddingValue, + numImages); hl_device_synchronize(); } -void hl_conv_random_disturb(const real* images, int imgSize, - int tgtSize, int channels, int numImages, - real scaleRatio, real rotateAngle, - int samplingRate, real* gpu_r_angle, - real* gpu_s_ratio, int* gpu_center_r, - int* gpu_center_c, int paddingValue, - bool isTrain, real* targets) { +void hl_conv_random_disturb(const real* images, + int imgSize, + int tgtSize, + int channels, + int numImages, + real scaleRatio, + real rotateAngle, + int samplingRate, + real* gpu_r_angle, + real* gpu_s_ratio, + int* gpu_center_r, + int* gpu_center_c, + int paddingValue, + bool isTrain, + real* targets) { // generate the random disturbance sequence and the sampling locations - hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r, - gpu_center_c, numImages, imgSize, rotateAngle, - scaleRatio, samplingRate, isTrain); - - hl_conv_random_disturb_with_params( - images, imgSize, tgtSize, channels, numImages, - samplingRate, gpu_r_angle, gpu_s_ratio, - gpu_center_r, gpu_center_r, paddingValue, - targets); + hl_generate_disturb_params(gpu_r_angle, + gpu_s_ratio, + gpu_center_r, + gpu_center_c, + numImages, + imgSize, + rotateAngle, + scaleRatio, + samplingRate, + isTrain); + + hl_conv_random_disturb_with_params(images, + imgSize, + tgtSize, + channels, + numImages, + samplingRate, + gpu_r_angle, + gpu_s_ratio, + gpu_center_r, + gpu_center_r, + paddingValue, + targets); } diff --git a/paddle/cuda/src/hl_table_apply.cu b/paddle/cuda/src/hl_table_apply.cu index 61edbe3ccc7028fd8779c4119f33c4cb5afe0564..d3b71c75e6e69d48c8d98041e3d6075aa8d53610 100644 --- a/paddle/cuda/src/hl_table_apply.cu +++ b/paddle/cuda/src/hl_table_apply.cu @@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY 
KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "hl_base.h" -#include "hl_device_functions.cuh" #include "hl_cuda.h" +#include "hl_device_functions.cuh" #include "paddle/utils/Logging.h" -template -__global__ void KeMatrixAddRows(real* output, int ldo, - real* table, int ldt, +template +__global__ void KeMatrixAddRows(real* output, + int ldo, + real* table, + int ldt, int* ids, int numSamples, int tableSize, @@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo, while (idy < numSamples) { int tableId = ids[idy]; if ((0 <= tableId) && (tableId < tableSize)) { - real *out = output + idy * ldo; - real *tab = table + tableId * ldt; + real* out = output + idy * ldo; + real* tab = table + tableId * ldt; for (int i = idx; i < dim; i += blockDimX) { if (AddRow) { paddle::paddleAtomicAdd(&tab[i], out[i]); @@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo, } } -void hl_matrix_select_rows(real* output, int ldo, - real* table, int ldt, +void hl_matrix_select_rows(real* output, + int ldo, + real* table, + int ldt, int* ids, int numSamples, int tableSize, @@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo, dim3 threads(128, 8); dim3 grid(8, 1); - KeMatrixAddRows<128, 8, 8, 0><<< grid, threads, 0, STREAM_DEFAULT >>> - (output, ldo, table, ldt, ids, numSamples, tableSize, dim); + KeMatrixAddRows<128, 8, 8, 0><<>>( + output, ldo, table, ldt, ids, numSamples, tableSize, dim); CHECK_SYNC("hl_matrix_select_rows failed"); } -void hl_matrix_add_to_rows(real* table, int ldt, - real* input, int ldi, +void hl_matrix_add_to_rows(real* table, + int ldt, + real* input, + int ldi, int* ids, int numSamples, int tableSize, @@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt, dim3 threads(128, 8); dim3 grid(8, 1); - KeMatrixAddRows<128, 8, 8, 1><<< grid, threads, 0, STREAM_DEFAULT >>> - (input, ldi, table, ldt, ids, numSamples, tableSize, dim); + KeMatrixAddRows<128, 8, 8, 1><<>>( + input, ldi, table, ldt, ids, numSamples, tableSize, dim); CHECK_SYNC("hl_matrix_add_to_rows failed"); } -template -__global__ void KeVectorSelect(T* dst, int sized, - const T* src, int sizes, - const int* ids, int sizei) { +template +__global__ void KeVectorSelect( + T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) { int idx = threadIdx.x + blockDimX * blockIdx.x; while (idx < sizei) { int index = ids[idx]; @@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized, } template -void hl_vector_select_from(T* dst, int sized, - const T* src, int sizes, - const int* ids, int sizei) { +void hl_vector_select_from( + T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) { CHECK_NOTNULL(dst); CHECK_NOTNULL(src); CHECK_NOTNULL(ids); @@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized, dim3 threads(512, 1); dim3 grid(8, 1); - KeVectorSelect<<< grid, threads, 0, STREAM_DEFAULT >>> - (dst, sized, src, sizes, ids, sizei); + KeVectorSelect<<>>( + dst, sized, src, sizes, ids, sizei); CHECK_SYNC("hl_vector_select_from failed"); } -template -void hl_vector_select_from(real* dst, int sized, - const real* src, int sizes, - const int* ids, int sizei); -template -void hl_vector_select_from(int* dst, int sized, - const int* src, int sizes, - const int* ids, int sizei); - +template void hl_vector_select_from(real* dst, + int sized, + const real* src, + int sizes, + const int* ids, + int sizei); +template void hl_vector_select_from( + 
int* dst, int sized, const int* src, int sizes, const int* ids, int sizei); diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu index 4f0bbfcf4e3aa51dd06acf254af65c62098a1df7..1896a56634c3a75e5a2a1e08661088b263f8ee10 100644 --- a/paddle/cuda/src/hl_top_k.cu +++ b/paddle/cuda/src/hl_top_k.cu @@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "hl_base.h" -#include "hl_top_k.h" #include "hl_sparse.ph" +#include "hl_top_k.h" #include "paddle/utils/Logging.h" // using namespace hppl; struct Pair { - __device__ __forceinline__ - Pair() {} + __device__ __forceinline__ Pair() {} - __device__ __forceinline__ - Pair(real value, int id) : v_(value), id_(id) {} + __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {} - __device__ __forceinline__ - void set(real value, int id) { + __device__ __forceinline__ void set(real value, int id) { v_ = value; id_ = id; } - __device__ __forceinline__ - void operator=(const Pair& in) { + __device__ __forceinline__ void operator=(const Pair& in) { v_ = in.v_; id_ = in.id_; } - __device__ __forceinline__ - bool operator<(const real value) const { + __device__ __forceinline__ bool operator<(const real value) const { return (v_ < value); } - __device__ __forceinline__ - bool operator<(const Pair& in) const { + __device__ __forceinline__ bool operator<(const Pair& in) const { return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_)); } - __device__ __forceinline__ - bool operator>(const Pair& in) const { + __device__ __forceinline__ bool operator>(const Pair& in) const { return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_)); } @@ -58,8 +50,9 @@ struct Pair { int id_; }; -__device__ __forceinline__ -void addTo(Pair topK[], const Pair &p, int beamSize) { +__device__ __forceinline__ void addTo(Pair topK[], + const Pair& p, + int beamSize) { for (int k = beamSize - 2; k >= 0; k--) { if (topK[k] < p) { topK[k + 1] = topK[k]; @@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) { topK[0] = p; } -template -__device__ __forceinline__ -void addTo(Pair topK[], const Pair &p) { +template +__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) { for (int k = beamSize - 2; k >= 0; k--) { if (topK[k] < p) { topK[k + 1] = topK[k]; @@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) { topK[0] = p; } -template -__device__ __forceinline__ -void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) { +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* src, int idx, int dim, int beamSize) { while (idx < dim) { if (topK[beamSize - 1] < src[idx]) { Pair tmp(src[idx], idx); @@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) { } } -template -__device__ __forceinline__ -void getTopK(Pair topK[], real *src, int idx, int dim, - const Pair& max, int beamSize) { +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) { while (idx < dim) { if (topK[beamSize - 1] < src[idx]) { Pair tmp(src[idx], idx); @@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, } } -template -__device__ __forceinline__ -void getTopK(Pair topK[], real *val, int *col, - int idx, int dim, int beamSize) { +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* val, int* col, int idx, int dim, int beamSize) { while (idx < dim) 
{ if (topK[beamSize - 1] < val[idx]) { Pair tmp(val[idx], col[idx]); @@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col, } } -template -__device__ __forceinline__ -void getTopK(Pair topK[], real *val, int *col, int idx, int dim, - const Pair& max, int beamSize) { +template +__device__ __forceinline__ void getTopK(Pair topK[], + real* val, + int* col, + int idx, + int dim, + const Pair& max, + int beamSize) { while (idx < dim) { if (topK[beamSize - 1] < val[idx]) { Pair tmp(val[idx], col[idx]); @@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim, } } -template -__device__ __forceinline__ -void threadGetTopK(Pair topK[], int& beam, int beamSize, - real* src, - bool& firstStep, bool& isEmpty, Pair& max, - int dim, const int tid) { +template +__device__ __forceinline__ void threadGetTopK(Pair topK[], + int& beam, + int beamSize, + real* src, + bool& firstStep, + bool& isEmpty, + Pair& max, + int dim, + const int tid) { if (beam > 0) { int length = beam < beamSize ? beam : beamSize; if (firstStep) { @@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } if (!isEmpty) { - getTopK(topK + maxLength - beam, src, tid, dim, - max, length); + getTopK(topK + maxLength - beam, src, tid, dim, max, length); } } @@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } -template -__device__ __forceinline__ -void threadGetTopK(Pair topK[], int& beam, int beamSize, - real* val, int* col, - bool& firstStep, bool& isEmpty, Pair& max, - int dim, const int tid) { +template +__device__ __forceinline__ void threadGetTopK(Pair topK[], + int& beam, + int beamSize, + real* val, + int* col, + bool& firstStep, + bool& isEmpty, + Pair& max, + int dim, + const int tid) { if (beam > 0) { int length = beam < beamSize ? beam : beamSize; if (firstStep) { @@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } if (!isEmpty) { - getTopK(topK + maxLength - beam, val, col, tid, dim, - max, length); + getTopK( + topK + maxLength - beam, val, col, tid, dim, max, length); } } @@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } -template -__device__ __forceinline__ -void blockReduce(Pair* shTopK, int* maxId, Pair topK[], - real** topVal, int** topIds, - int& beam, int& beamSize, - const int tid, const int warp) { +template +__device__ __forceinline__ void blockReduce(Pair* shTopK, + int* maxId, + Pair topK[], + real** topVal, + int** topIds, + int& beam, + int& beamSize, + const int tid, + const int warp) { while (true) { __syncthreads(); if (tid < blockSize / 2) { @@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[], } } __syncthreads(); - for (int stride = blockSize / 4; stride > 0; stride = stride/2) { + for (int stride = blockSize / 4; stride > 0; stride = stride / 2) { if (tid < stride) { if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) { maxId[tid] = maxId[tid + stride]; @@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[], * 3. go to the second setp, until one thread's topK value is null; * 4. go to the first setp, until get the topK value. 
*/ -template -__global__ void KeMatrixTopK(real* topVal, int ldv, - int * topIds, - real* src, int lds, +template +__global__ void KeMatrixTopK(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, int dim, int beamSize) { __shared__ Pair shTopK[blockSize]; @@ -271,7 +279,7 @@ __global__ void KeMatrixTopK(real* topVal, int ldv, topVal += blockIdx.x * ldv; topIds += blockIdx.x * beamSize; - Pair topK[maxLength]; // NOLINT + Pair topK[maxLength]; // NOLINT int beam = maxLength; Pair max; bool isEmpty = false; @@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv, topK[k].set(-HL_FLOAT_MAX, -1); } while (beamSize) { - threadGetTopK - (topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); + threadGetTopK( + topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); shTopK[tid] = topK[0]; - blockReduce - (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); } } -template -__global__ void KeSMatrixTopK(real* topVal, int ldv, - int * topIds, +template +__global__ void KeSMatrixTopK(real* topVal, + int ldv, + int* topIds, real* val, int* row, int* col, @@ -304,7 +313,7 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv, topVal += blockIdx.x * ldv; topIds += blockIdx.x * beamSize; - Pair topK[maxLength]; // NOLINT + Pair topK[maxLength]; // NOLINT int beam = maxLength; Pair max; bool isEmpty = false; @@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv, topK[k].set(-HL_FLOAT_MAX, -1); } while (beamSize) { - threadGetTopK - (topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid); + threadGetTopK( + topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid); shTopK[tid] = topK[0]; - blockReduce - (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); } } -void hl_matrix_top_k(real* topVal, int ldv, - int * topIds, - real* src, int lds, +void hl_matrix_top_k(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, int dim, int beamSize, int numSamples) { @@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv, dim3 threads(256, 1); dim3 grid(numSamples, 1); - KeMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>> - (topVal, ldv, topIds, src, lds, dim, beamSize); + KeMatrixTopK<5, 256><<>>( + topVal, ldv, topIds, src, lds, dim, beamSize); CHECK_SYNC("hl_matrix_top_k failed"); } -void hl_sparse_matrix_top_k(real* topVal, int ldv, - int * topIds, +void hl_sparse_matrix_top_k(real* topVal, + int ldv, + int* topIds, hl_sparse_matrix_s src, int beamSize, int numSamples) { CHECK_NOTNULL(topVal); CHECK_NOTNULL(topIds); CHECK_NOTNULL(src); - CHECK_EQ(src->format, HL_SPARSE_CSR) - <<"sparse matrix format error!"; + CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!"; hl_csr_matrix csr = (hl_csr_matrix)src->matrix; - if (csr->csr_val == NULL || csr->csr_row == NULL || - csr->csr_col == NULL) { + if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) { LOG(FATAL) << "parameter src is null!"; } dim3 threads(256, 1); dim3 grid(numSamples, 1); - KeSMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>> - (topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize); + KeSMatrixTopK<5, 256><<>>( + topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize); CHECK_SYNC("hl_sparse_matrix_top_k failed"); } @@ -392,10 +402,12 @@ void 
hl_sparse_matrix_top_k(real* topVal, int ldv, * 3. go to the second setp, until one thread's topK value is null; * 4. go to the first setp, until get the topK value. */ -template -__global__ void KeMatrixTopKClassificationError(real* topVal, int ldv, - int * topIds, - real* src, int lds, +template +__global__ void KeMatrixTopKClassificationError(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, int dim, int beamSize, int* label, @@ -408,7 +420,7 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv, topVal += blockIdx.x * ldv; topIds += blockIdx.x * beamSize; - Pair topK[maxLength]; // NOLINT + Pair topK[maxLength]; // NOLINT int beam = maxLength; Pair max; bool isEmpty = false; @@ -420,34 +432,36 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv, } while (beamSize) { - threadGetTopK - (topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); + threadGetTopK( + topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); shTopK[tid] = topK[0]; - blockReduce - (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); } __syncthreads(); if (tid == 0) { for (int i = 0; i < topkSize; i++) { - if (*--topIds == label[blockIdx.x]) { - recResult[blockIdx.x] = 0; - break; - } - recResult[blockIdx.x] = 1.0f; + if (*--topIds == label[blockIdx.x]) { + recResult[blockIdx.x] = 0; + break; + } + recResult[blockIdx.x] = 1.0f; } } } -void hl_matrix_classification_error(real* topVal, int ldv, - int* topIds, - real* src, int lds, - int dim, - int topkSize, - int numSamples, - int* label, - real* recResult) { +void hl_matrix_classification_error(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, + int dim, + int topkSize, + int numSamples, + int* label, + real* recResult) { CHECK_NOTNULL(topVal); CHECK_NOTNULL(topIds); CHECK_NOTNULL(src); @@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv, dim3 threads(256, 1); dim3 grid(numSamples, 1); - KeMatrixTopKClassificationError<5, 256> - <<< grid, threads, 0, STREAM_DEFAULT >>> - (topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult); + KeMatrixTopKClassificationError<5, 256><<>>( + topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult); CHECK_SYNC("hl_matrix_top_k classification error failed"); } diff --git a/paddle/framework/attr_type.proto b/paddle/framework/attr_type.proto index 2d8e0476d710b7ba987d085d828ca13a4ee23707..13ae312c10e934566384b8bd0f41dacd6c01fc2f 100644 --- a/paddle/framework/attr_type.proto +++ b/paddle/framework/attr_type.proto @@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -syntax="proto2"; +syntax = "proto2"; package paddle.framework; // Attribute Type for paddle's Op. // Op contains many attributes. Each type of attributes could be different. // The AttrType will be shared between AttrDesc and AttrProto. 
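For illustration only (this snippet is not part of the patch): given the AttrType enum below and the AttrDesc message defined in op_desc.proto further down, a C++ consumer of these protos might dispatch on the shared type tag roughly as sketched here. The helper name AttrDescToString and the generated-header include paths are assumptions, not code from this repository.

#include <sstream>
#include <string>
#include "paddle/framework/attr_type.pb.h"  // assumed path of the generated header
#include "paddle/framework/op_desc.pb.h"    // assumed path of the generated header

// Render the value carried by an AttrDesc by switching on its AttrType tag.
std::string AttrDescToString(const paddle::framework::AttrDesc& attr) {
  std::ostringstream os;
  os << attr.name() << " = ";
  switch (attr.type()) {
    case paddle::framework::INT:    os << attr.i(); break;
    case paddle::framework::FLOAT:  os << attr.f(); break;
    case paddle::framework::STRING: os << attr.s(); break;
    case paddle::framework::INTS:
      for (int v : attr.ints()) os << v << ' ';
      break;
    case paddle::framework::FLOATS:
      for (float v : attr.floats()) os << v << ' ';
      break;
    case paddle::framework::STRINGS:
      for (const std::string& v : attr.strings()) os << v << ' ';
      break;
  }
  return os.str();
}
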
enum AttrType { - INT = 0; - FLOAT = 1; - STRING = 2; - INTS = 3; - FLOATS = 4; - STRINGS = 5; + INT = 0; + FLOAT = 1; + STRING = 2; + INTS = 3; + FLOATS = 4; + STRINGS = 5; } \ No newline at end of file diff --git a/paddle/framework/op_desc.proto b/paddle/framework/op_desc.proto index 89497f3c16bc28aa93b25a83c1f2eccafdf1c5b4..ddde1f7af371f88c49a20d6246c6fa025f23c28e 100644 --- a/paddle/framework/op_desc.proto +++ b/paddle/framework/op_desc.proto @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -syntax="proto2"; +syntax = "proto2"; package paddle.framework; import "attr_type.proto"; @@ -22,14 +22,14 @@ import "attr_type.proto"; // // e.g, for scale=3.0: name=scala, type=AttrType.FLOAT, value=3.0 message AttrDesc { - required string name = 1; - required AttrType type = 2; - optional int32 i = 3; - optional float f = 4; - optional string s = 5; - repeated int32 ints = 6; - repeated float floats = 7; - repeated string strings = 8; + required string name = 1; + required AttrType type = 2; + optional int32 i = 3; + optional float f = 4; + optional string s = 5; + repeated int32 ints = 6; + repeated float floats = 7; + repeated string strings = 8; }; // Protocol Message to describe an Operator. @@ -42,15 +42,15 @@ message AttrDesc { // 3rd-party language can build this proto message and call // AddOp(const OpDesc& op_desc) of Paddle core to create an Operator. message OpDesc { - // input names of this Operator. - repeated string inputs = 1; + // input names of this Operator. + repeated string inputs = 1; - // output names of this Operator. - repeated string outputs = 2; + // output names of this Operator. + repeated string outputs = 2; - // type of this Operator, such as "add", "sub", "fc". - required string type = 3; + // type of this Operator, such as "add", "sub", "fc". + required string type = 3; - // Attributes of this Operator. e.g., scale=3.0 in cosine op. - repeated AttrDesc attrs = 4; + // Attributes of this Operator. e.g., scale=3.0 in cosine op. + repeated AttrDesc attrs = 4; }; \ No newline at end of file diff --git a/paddle/framework/op_proto.proto b/paddle/framework/op_proto.proto index 366c84e53dc29e41eefbaef0a6452e01c4fe37bd..bdf0958ffc837e2e8a4309bf0a2b061d037f86f7 100644 --- a/paddle/framework/op_proto.proto +++ b/paddle/framework/op_proto.proto @@ -15,10 +15,11 @@ limitations under the License. */ // Protocol Message for 3rd-party language binding. // // Paddle Python package will use `OpProto` to generate op creation methods. -// The op creation methods take user's input and generate `OpDesc` proto message, +// The op creation methods take user's input and generate `OpDesc` proto +// message, // then pass `OpDesc` to C++ side and create Op pointer. // -syntax="proto2"; +syntax = "proto2"; package paddle.framework; import "attr_type.proto"; @@ -26,89 +27,90 @@ import "attr_type.proto"; // Attribute protocol message for 3rd-party language binding. // It will store the Op support what attribute and what type. message AttrProto { - // Supported attribute name. e.g. `scale` for cosine op. - required string name = 1; + // Supported attribute name. e.g. `scale` for cosine op. + required string name = 1; - // Supported attribute type. - required AttrType type = 2; + // Supported attribute type. + required AttrType type = 2; - // Supported attribute comments. It helps 3rd-party language generate doc-string. 
- required string comment = 3; + // Supported attribute comments. It helps 3rd-party language generate + // doc-string. + required string comment = 3; - // If that attribute is generated, it means the Paddle third language - // binding has responsibility to fill that attribute. End-User should - // not set that attribute. - optional bool generated = 4 [default=false]; + // If that attribute is generated, it means the Paddle third language + // binding has responsibility to fill that attribute. End-User should + // not set that attribute. + optional bool generated = 4 [ default = false ]; } // Input or output message for 3rd-party language binding. // It contains parameter name and its comments. message VarProto { - // Input or output name in that op creation function. - // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names. - required string name = 1; - - // The comment for that input. It helps 3rd-party language generate doc-string. - required string comment = 2; - - // Is that input/output could be a list or not. - // If so, that Op should write a attributed named `input_format` or - // `output_format`. - // - // e.g. - // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W` - // could be multiple, so the multiple of `X` and `W` is True, and OpDesc - // will hold a attribute of them. - // - // The Op desc of same fc could be - // { - // "type": "fc", - // "input": ["X1", "X2", "W1", "W2", "b"], - // "output": "fc.out", - // "attrs" : { - // "input_format": [0, 2, 4, 5] - // } - // } - // - optional bool multiple = 3 [default=false]; - - // It marks that output is a temporary output. That output is not used by - // user, but used by other op internally as input. If other op is not use - // that output, it could be optimized early. - // - // Attribute temporary_index will be set in OpDesc if there is some - // outputs are temporary. - // - // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"], - // attrs = { - // "temporary_index": [1] - // } - optional bool temporary = 4 [default=false]; - - // The gradient of operator can be ignored immediately - // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2 - // can be ignored for the future optimized on graph. - optional bool ignore_gradient = 6; + // Input or output name in that op creation function. + // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names. + required string name = 1; + + // The comment for that input. It helps 3rd-party language generate + // doc-string. + required string comment = 2; + + // Is that input/output could be a list or not. + // If so, that Op should write a attributed named `input_format` or + // `output_format`. + // + // e.g. + // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W` + // could be multiple, so the multiple of `X` and `W` is True, and OpDesc + // will hold a attribute of them. + // + // The Op desc of same fc could be + // { + // "type": "fc", + // "input": ["X1", "X2", "W1", "W2", "b"], + // "output": "fc.out", + // "attrs" : { + // "input_format": [0, 2, 4, 5] + // } + // } + // + optional bool multiple = 3 [ default = false ]; + + // It marks that output is a temporary output. That output is not used by + // user, but used by other op internally as input. If other op is not use + // that output, it could be optimized early. + // + // Attribute temporary_index will be set in OpDesc if there is some + // outputs are temporary. 
+ // + // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"], + // attrs = { + // "temporary_index": [1] + // } + optional bool temporary = 4 [ default = false ]; + + // The gradient of operator can be ignored immediately + // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2 + // can be ignored for the future optimized on graph. + optional bool ignore_gradient = 6; } // Op protocol message for 3rd-party language binding. // It contains all information for generating op creation method. message OpProto { - // The input information to generate op creation method. - repeated VarProto inputs = 1; + // The input information to generate op creation method. + repeated VarProto inputs = 1; - // The output information to generate op creation method. - repeated VarProto outputs = 2; + // The output information to generate op creation method. + repeated VarProto outputs = 2; - // The attribute information to generate op creation method. - repeated AttrProto attrs = 3; + // The attribute information to generate op creation method. + repeated AttrProto attrs = 3; - // The comments for that Op. It helps 3rd-party language generate - // doc-string. The whole documentation of that Op is generated by comment, - // inputs, outputs, attrs together. - required string comment = 4; - - // The type of that Op. - required string type = 5; + // The comments for that Op. It helps 3rd-party language generate + // doc-string. The whole documentation of that Op is generated by comment, + // inputs, outputs, attrs together. + required string comment = 4; + // The type of that Op. + required string type = 5; } diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu index 1a5b4042402df3081a493962a5e080d72b7f40b2..4492dea5d8a6f8580a13f3059401c87fa2164085 100644 --- a/paddle/function/ContextProjectionOpGpu.cu +++ b/paddle/function/ContextProjectionOpGpu.cu @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "ContextProjectionOp.h" +#include "hl_base.h" namespace paddle { @@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input, int block_size = blockDim.x; int sequenceId = blockIdx.x; int seq_start = sequence[sequenceId]; - int seq_end = sequence[sequenceId+1]; + int seq_end = sequence[sequenceId + 1]; real value = 0; int instances = seq_end - seq_start + context_length - 1; @@ -49,8 +49,9 @@ __global__ void KeContextProjectionForward(const real* input, } else if ((i + context_start) >= (seq_end - seq_start)) { if (padding) { value = - weight[(begin_pad + i + context_start - (seq_end - seq_start)) * - input_dim + idx]; + weight[(begin_pad + i + context_start - (seq_end - seq_start)) * + input_dim + + idx]; } else { continue; } @@ -61,7 +62,7 @@ __global__ void KeContextProjectionForward(const real* input, int outx = (i - context_length) < 0 ? i : (context_length - 1); int outy = (i - context_length) < 0 ? 
0 : (i - (context_length - 1)); real* output_r = - output + outy * input_dim * context_length + outx * input_dim; + output + outy * input_dim * context_length + outx * input_dim; for (int j = outy; j < seq_end - seq_start; j++) { output_r[idx] += value; if (j - outy == outx) break; @@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input, dim3 grid(blocks_x, blocks_y); if (weight) { - KeContextProjectionForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (input, sequence, weight, output, input_dim, - context_length, context_start, begin_pad); - } else { - KeContextProjectionForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (input, sequence, weight, output, input_dim, - context_length, context_start, begin_pad); + KeContextProjectionForward<<>>( + input, + sequence, + weight, + output, + input_dim, + context_length, + context_start, + begin_pad); + } else { + KeContextProjectionForward<<>>( + input, + sequence, + weight, + output, + input_dim, + context_length, + context_start, + begin_pad); } CHECK_SYNC("hl_context_projection_forward failed"); } @@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad, int block_size = blockDim.x; int sequenceId = blockIdx.x; int seq_start = sequence[sequenceId]; - int seq_end = sequence[sequenceId+1]; + int seq_end = sequence[sequenceId + 1]; real value = 0; int instances = seq_end - seq_start + context_length - 1; @@ -170,7 +183,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad, int outx = (i - context_length) < 0 ? i : (context_length - 1); int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1)); real* output_r = - out + outy * input_dim * context_length + outx * input_dim; + out + outy * input_dim * context_length + outx * input_dim; for (int j = outy; j < seq_end - seq_start; j++) { value += output_r[idx]; if (j - outy == outx) break; @@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad, int blocks_y = 1; dim3 threads(block_size, 1); dim3 grid(blocks_x, blocks_y); - KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>> - (out_grad, sequence, input_grad, input_dim, context_length, context_start); + KeContextProjectionBackwardData<<>>( + out_grad, sequence, input_grad, input_dim, context_length, context_start); CHECK_SYNC("hl_context_projection_backward_data failed"); } @@ -231,7 +244,7 @@ void ContextProjectionBackwardData(const GpuMatrix& out_grad, context_start); } -template +template __global__ void KeContextProjectionBackwardWeight(const real* out_grad, const int* sequence, real* w_grad, @@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad, if (weight_idx < w_dim) { for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) { int seq_start = sequence[seqId]; - int seq_end = sequence[seqId+1]; - output_r = const_cast(out_grad) - + seq_start * w_dim * context_length; + int seq_end = sequence[seqId + 1]; + output_r = + const_cast(out_grad) + seq_start * w_dim * context_length; if (context_start < 0) { if (padId + context_start < 0) { instanceId = padId; } else { // begin_pad > 0; - instanceId = (padId - begin_pad) + - (seq_end - seq_start) - context_start; + instanceId = + (padId - begin_pad) + (seq_end - seq_start) - context_start; } } else { if (padId + (seq_end - seq_start) < context_start) { @@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad, } } - int outx = (instanceId - context_length) < 0 ? 
- instanceId : (context_length - 1); - int outy = (instanceId - context_length) < 0 ? - 0 : (instanceId - (context_length - 1)); + int outx = + (instanceId - context_length) < 0 ? instanceId : (context_length - 1); + int outy = (instanceId - context_length) < 0 + ? 0 + : (instanceId - (context_length - 1)); output_r += outy * w_dim * context_length + outx * w_dim; for (int j = outy; j < seq_end - seq_start; j++) { value += output_r[weight_idx]; @@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad, } __syncthreads(); - for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) { + for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) { if (idy < stride) { sum_s[idy][idx] += sum_s[idy + stride][idx]; } @@ -339,22 +353,27 @@ void hl_context_projection_backward_weight(const real* out_grad, dim3 threads(threads_x, threads_y); dim3 grid(blocks_x, 1); - KeContextProjectionBackwardWeight<32, 32> - <<< grid, threads, 0, STREAM_DEFAULT >>> - (out_grad, sequence, w_grad, num_sequences, w_dim, - context_length, context_start, begin_pad); + KeContextProjectionBackwardWeight<32, + 32><<>>( + out_grad, + sequence, + w_grad, + num_sequences, + w_dim, + context_length, + context_start, + begin_pad); CHECK_SYNC("hl_context_projection_backward_weight failed"); } template <> -void ContextProjectionBackwardWeight( - const GpuMatrix& out_grad, - GpuMatrix& w_grad, - const GpuIVector& seq_vec, - size_t context_length, - int context_start, - size_t total_pad, - size_t begin_pad) { +void ContextProjectionBackwardWeight(const GpuMatrix& out_grad, + GpuMatrix& w_grad, + const GpuIVector& seq_vec, + size_t context_length, + int context_start, + size_t total_pad, + size_t begin_pad) { hl_context_projection_backward_weight(out_grad.getData(), seq_vec.getData(), w_grad.getData(), @@ -376,23 +395,18 @@ void ContextProjectionBackward(const GpuMatrix& out_grad, size_t begin_pad, bool is_padding, size_t total_pad) { - if (in_grad) { - ContextProjectionBackwardData( - out_grad, - in_grad, - sequence, - context_length, - context_start); - } - if (is_padding && w_grad) { - ContextProjectionBackwardWeight( - out_grad, - w_grad, - sequence, - context_length, - context_start, - total_pad, - begin_pad); + if (in_grad) { + ContextProjectionBackwardData( + out_grad, in_grad, sequence, context_length, context_start); + } + if (is_padding && w_grad) { + ContextProjectionBackwardWeight(out_grad, + w_grad, + sequence, + context_length, + context_start, + total_pad, + begin_pad); } } diff --git a/paddle/function/CosSimOpGpu.cu b/paddle/function/CosSimOpGpu.cu index c62ab39551f02288618244871ae31c6800df5b42..a1f88f479b5818e3864129a4dac723bceed76fcf 100644 --- a/paddle/function/CosSimOpGpu.cu +++ b/paddle/function/CosSimOpGpu.cu @@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "CosSimOp.h" #include "hl_base.h" #include "hl_device_functions.cuh" -#include "CosSimOp.h" namespace paddle { -template +template __global__ void KeCosSim(real* output, const real* input1, const real* input2, @@ -78,8 +78,8 @@ void hlCossim(real* output, dim3 threads(block_size, 1); dim3 grid(1, input1_height); - KeCosSim<<>> - (output, input1, input2, width, input1_height, input2_height, scale); + KeCosSim<<>>( + output, input1, input2, width, input1_height, input2_height, scale); CHECK_SYNC("hlCossim failed"); } @@ -99,7 +99,7 @@ void CosSimForward(GpuMatrix& out_mat, hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale); } -template +template __global__ void KeCosSimDerivative(const real* grad, const real* output, const real* prev_out_x, @@ -148,14 +148,13 @@ __global__ void KeCosSimDerivative(const real* grad, if (xy[0] == 0) { real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0])); for (int index = tid; index < width; index += block_size) { - prev_grad_x[index] += - scale * grad[ty] * prev_out_y[index] * reciprocal; + prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal; if (input2_height > 1) { - prev_grad_y[index] += - scale * grad[ty] * prev_out_x[index] * reciprocal; + prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal; } else { - paddle::paddleAtomicAdd(prev_grad_y + index, - scale * grad[ty] * prev_out_x[index] * reciprocal); + paddle::paddleAtomicAdd( + prev_grad_y + index, + scale * grad[ty] * prev_out_x[index] * reciprocal); } } } else { @@ -163,17 +162,18 @@ __global__ void KeCosSimDerivative(const real* grad, real reciprocalSquareSumX = 1.0 / xx[0]; real reciprocalSquareSumY = 1.0 / yy[0]; for (int index = tid; index < width; index += block_size) { - prev_grad_x[index] += output[ty] * grad[ty] * - (prev_out_y[index] * reciprocalXY - - prev_out_x[index] * reciprocalSquareSumX); + prev_grad_x[index] += + output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY - + prev_out_x[index] * reciprocalSquareSumX); if (input2_height > 1) { - prev_grad_y[index] += output[ty] * grad[ty] * - (prev_out_x[index] * reciprocalXY - - prev_out_y[index] * reciprocalSquareSumY); + prev_grad_y[index] += + output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY - + prev_out_y[index] * reciprocalSquareSumY); } else { - paddle::paddleAtomicAdd(prev_grad_y + index, output[ty] * grad[ty] * - (prev_out_x[index] * reciprocalXY - - prev_out_y[index] * reciprocalSquareSumY)); + paddle::paddleAtomicAdd( + prev_grad_y + index, + output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY - + prev_out_y[index] * reciprocalSquareSumY)); } } } @@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad, const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, input1_height); - KeCosSimDerivative<<>> - (grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y, width, - input1_height, input2_height, scale); + KeCosSimDerivative<<>>( + grad, + output, + prev_out_x, + prev_out_y, + prev_grad_x, + prev_grad_y, + width, + input1_height, + input2_height, + scale); CHECK_SYNC("hlCossimDerivate failed"); } @@ -214,9 +222,9 @@ void CosSimBackward(const GpuMatrix& out_grad, real scale) { CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() && in2_val.getData() && in1_grad.getData() && in2_grad.getData()); - CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ - && in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_) - << "Matrix types are not equally GPU"; + CHECK(out_grad.useGpu_ && out_val.useGpu_ && 
in1_val.useGpu_ && + in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_) + << "Matrix types are not equally GPU"; size_t dim = in1_val.getWidth(); const real* grad = out_grad.getData(); diff --git a/paddle/function/CropOpGpu.cu b/paddle/function/CropOpGpu.cu index 786eb268d45aadee0c1f6fcbbafc23173cf0bc77..241356a9ca0b673c86ff4c39594722211e2d224e 100644 --- a/paddle/function/CropOpGpu.cu +++ b/paddle/function/CropOpGpu.cu @@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "CropOp.h" +#include "hl_base.h" namespace paddle { -__global__ void KeCrop(real* outputs, const real* inputs, - int inC, int inH, int inW, - int cropC, int cropH, int cropW, - int outC, int outH, int outW, int nthreads) { +__global__ void KeCrop(real* outputs, + const real* inputs, + int inC, + int inH, + int inW, + int cropC, + int cropH, + int cropW, + int outC, + int outH, + int outW, + int nthreads) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < nthreads) { const int w = idx % outW; @@ -35,12 +43,12 @@ __global__ void KeCrop(real* outputs, const real* inputs, template <> void Crop(real* outputs, - const real* inputs, - const TensorShape inShape, - const TensorShape outShape, - const FuncConfig& conf) { + const real* inputs, + const TensorShape inShape, + const TensorShape outShape, + const FuncConfig& conf) { std::vector crop_corner = - conf.get>("crop_corner"); + conf.get>("crop_corner"); int cropC = crop_corner[1]; int cropH = crop_corner[2]; int cropW = crop_corner[3]; @@ -58,16 +66,33 @@ void Crop(real* outputs, int blockSize = 1024; int gridSize = (nth + blockSize - 1) / blockSize; - KeCrop<<>> - (outputs, inputs, inC, inH, inW, cropC, cropH, cropW, - outC, outH, outW, nth); + KeCrop<<>>(outputs, + inputs, + inC, + inH, + inW, + cropC, + cropH, + cropW, + outC, + outH, + outW, + nth); CHECK_SYNC("Crop"); } -__global__ void KeCropDiff(const real* inGrad, real* outGrad, - int inC, int inH, int inW, - int cropC, int cropH, int cropW, - int outC, int outH, int outW, int nthreads) { +__global__ void KeCropDiff(const real* inGrad, + real* outGrad, + int inC, + int inH, + int inW, + int cropC, + int cropH, + int cropW, + int outC, + int outH, + int outW, + int nthreads) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < nthreads) { const int w = idx % inW; @@ -84,12 +109,12 @@ __global__ void KeCropDiff(const real* inGrad, real* outGrad, template <> void CropGrad(const real* inGrad, - real* outGrad, - const TensorShape inShape, - const TensorShape outShape, - const FuncConfig& conf) { + real* outGrad, + const TensorShape inShape, + const TensorShape outShape, + const FuncConfig& conf) { std::vector crop_corner = - conf.get>("crop_corner"); + conf.get>("crop_corner"); int cropC = crop_corner[1]; int cropH = crop_corner[2]; int cropW = crop_corner[3]; @@ -107,9 +132,18 @@ void CropGrad(const real* inGrad, int blockSize = 1024; int gridSize = (nth + blockSize - 1) / blockSize; - KeCropDiff <<>> - (inGrad, outGrad, inC, inH, inW, cropC, cropH, cropW, - outC, outH, outW, nth); + KeCropDiff<<>>(inGrad, + outGrad, + inC, + inH, + inW, + cropC, + cropH, + cropW, + outC, + outH, + outW, + nth); CHECK_SYNC("CropGrad"); } diff --git a/paddle/function/CrossMapNormalOpGpu.cu b/paddle/function/CrossMapNormalOpGpu.cu index b33dd108348b7789c6e73bfe3b1ffbc448163ef7..88b991ff6a1f028b333e82e2801ed2e9251aa36d 100644 --- 
a/paddle/function/CrossMapNormalOpGpu.cu +++ b/paddle/function/CrossMapNormalOpGpu.cu @@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "CrossMapNormalOp.h" +#include "hl_base.h" namespace paddle { -__global__ void KeCMRNormFillScale(size_t imageSize, const real* in, - real* scale, size_t channels, - size_t height, size_t width, size_t size, +__global__ void KeCMRNormFillScale(size_t imageSize, + const real* in, + real* scale, + size_t channels, + size_t height, + size_t width, + size_t size, real alpha) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < imageSize) { @@ -51,8 +55,10 @@ __global__ void KeCMRNormFillScale(size_t imageSize, const real* in, } } -__global__ void KeCMRNormOutput(size_t inputSize, const real* in, - const real* scale, real negative_beta, +__global__ void KeCMRNormOutput(size_t inputSize, + const real* in, + const real* scale, + real negative_beta, real* out) { const int index = threadIdx.x + blockIdx.x * blockDim.x; if (index < inputSize) { @@ -74,24 +80,30 @@ void CrossMapNormal(real* outputs, size_t imageSize = numSamples * height * width; int blockSize = 1024; int gridSize = (imageSize + 1024 - 1) / 1024; - KeCMRNormFillScale<<>> - (imageSize, inputs, denoms, channels, height, width, size, scale); + KeCMRNormFillScale<<>>( + imageSize, inputs, denoms, channels, height, width, size, scale); - size_t inputSize = numSamples * height * width *channels; + size_t inputSize = numSamples * height * width * channels; blockSize = 1024; gridSize = (inputSize + 1024 - 1) / 1024; - KeCMRNormOutput<<>> - (inputSize, inputs, denoms, -pow, outputs); + KeCMRNormOutput<<>>( + inputSize, inputs, denoms, -pow, outputs); CHECK_SYNC("CrossMapNormal"); } -__global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data, - const real* top_data, const real* scale, - const real* top_diff, size_t channels, - size_t height, size_t width, size_t size, - real negative_beta, real cache_ratio, - real* bottom_diff ) { +__global__ void KeCMRNormDiff(size_t imageSize, + const real* bottom_data, + const real* top_data, + const real* scale, + const real* top_diff, + size_t channels, + size_t height, + size_t width, + size_t size, + real negative_beta, + real cache_ratio, + real* bottom_diff) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < imageSize) { const int w = idx % width; @@ -113,17 +125,17 @@ __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data, while (index < channels + post_pad) { if (index < channels) { accum += top_diff[index * step] * top_data[index * step] / - scale[index * step]; + scale[index * step]; } if (index >= size) { accum -= top_diff[(index - size) * step] * - top_data[(index - size) * step] / scale[(index - size) * step]; + top_data[(index - size) * step] / scale[(index - size) * step]; } if (index >= post_pad) { bottom_diff[(index - post_pad) * step] += - top_diff[(index - post_pad) * step] * - pow(scale[(index - post_pad) * step], negative_beta) - cache_ratio * - bottom_data[(index - post_pad) * step] * accum; + top_diff[(index - post_pad) * step] * + pow(scale[(index - post_pad) * step], negative_beta) - + cache_ratio * bottom_data[(index - post_pad) * step] * accum; } ++index; } @@ -147,9 +159,18 @@ void CrossMapNormalGrad(real* inputsGrad, int blockSize = 1024; int gridSize = (imageSize + 1024 - 1) / 1024; - KeCMRNormDiff <<>> - 
(imageSize, inputsValue, outputsValue, denoms, outputsGrad, channels, - height, width, size, -pow, 2.0f * pow * scale, inputsGrad); + KeCMRNormDiff<<>>(imageSize, + inputsValue, + outputsValue, + denoms, + outputsGrad, + channels, + height, + width, + size, + -pow, + 2.0f * pow * scale, + inputsGrad); CHECK_SYNC("CrossMapNormalGrad"); } diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu index ede0d27aa82e7d71ff5bc33df110fec260e06463..33463805cbd4746c05548028e0bc4a0e2a90453e 100644 --- a/paddle/function/DepthwiseConvOpGpu.cu +++ b/paddle/function/DepthwiseConvOpGpu.cu @@ -20,17 +20,25 @@ namespace paddle { // CUDA kernel to compute the depthwise convolution forward pass template -__global__ -void ConvolutionDepthwiseForward(const int nthreads, - const T* const inputData, const T* const filterData, - const int batchSize, const int outputChannels, const int outputHeight, - const int outputWidth, const int inputChannels, const int inputHeight, - const int inputWidth, const int filterMultiplier, const int filterHeight, - const int filterWidth, const int strideH, const int strideW, - const int paddingH, const int paddingW, T* const outputData) { - - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; +__global__ void ConvolutionDepthwiseForward(const int nthreads, + const T* const inputData, + const T* const filterData, + const int batchSize, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const outputData) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < nthreads) { const int batch = index / outputChannels / outputHeight / outputWidth; @@ -45,32 +53,36 @@ void ConvolutionDepthwiseForward(const int nthreads, const int w_in_start = -paddingW + w_out * strideW; const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1; const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1; - if ((h_in_start >= 0) && (h_in_end < inputHeight) - && (w_in_start >= 0) && (w_in_end < inputWidth)) { - for (int kh = 0; kh < filterHeight; ++kh) { - for (int kw = 0; kw < filterWidth; ++kw) { - const int h_in = -paddingH + h_out * strideH + kh; - const int w_in = -paddingW + w_out * strideW + kw; - const int offset = ((batch * inputChannels + c_in) - * inputHeight + h_in) * inputWidth + w_in; - value += (*weight) * inputData[offset]; - ++weight; - } + if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) && + (w_in_end < inputWidth)) { + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; + const int offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * + inputWidth + + w_in; + value += (*weight) * inputData[offset]; + ++weight; } + } } else { - for (int kh = 0; kh < filterHeight; ++kh) { - for (int kw = 0; kw < filterWidth; ++kw) { - const int h_in = -paddingH + h_out * strideH + kh; - const int w_in = -paddingW + w_out * strideW + kw; - if ((h_in >= 0) && (h_in < inputHeight) - && (w_in >= 0) && (w_in < inputWidth)) { - const int offset = ((batch * inputChannels + c_in) - * inputHeight + h_in) * inputWidth + w_in; - value += (*weight) * 
inputData[offset]; - } - ++weight; - } - } + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; + if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && + (w_in < inputWidth)) { + const int offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * + inputWidth + + w_in; + value += (*weight) * inputData[offset]; + } + ++weight; + } + } } outputData[index] = value; } @@ -78,16 +90,25 @@ void ConvolutionDepthwiseForward(const int nthreads, // CUDA kernel to compute the depthwise convolution backprop w.r.t input. template -__global__ -void ConvolutionDepthwiseInputBackward(const int nthreads, - const T* const top_diff, const T* const weight_data, - const int num, const int outputChannels, const int outputHeight, - const int outputWidth, const int inputChannels, const int inputHeight, - const int inputWidth, const int filterMultiplier, const int filterHeight, - const int filterWidth, const int strideH, const int strideW, - const int paddingH, const int paddingW, T* const bottom_diff) { - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; +__global__ void ConvolutionDepthwiseInputBackward(const int nthreads, + const T* const top_diff, + const T* const weight_data, + const int num, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const bottom_diff) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < nthreads) { const int batch = index / inputChannels / inputHeight / inputWidth; const int c_in = (index / inputHeight / inputWidth) % inputChannels; @@ -96,65 +117,80 @@ void ConvolutionDepthwiseInputBackward(const int nthreads, const int c_out_start = c_in * filterMultiplier; - int h_out_start = (h_in - filterHeight + paddingH + strideH)/strideH; + int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH; h_out_start = 0 > h_out_start ? 0 : h_out_start; - int h_out_end = (h_in + paddingH)/strideH; - h_out_end = outputHeight - 1 < h_out_end? outputHeight - 1 : h_out_end; - int w_out_start = (w_in - filterWidth + paddingW + strideW)/strideW; + int h_out_end = (h_in + paddingH) / strideH; + h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end; + int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW; w_out_start = 0 > w_out_start ? 0 : w_out_start; - int w_out_end = (w_in + paddingW)/strideW; - w_out_end = outputWidth - 1 < w_out_end? outputWidth - 1 : w_out_end; + int w_out_end = (w_in + paddingW) / strideW; + w_out_end = outputWidth - 1 < w_out_end ? 
outputWidth - 1 : w_out_end; T value = 0; - for (int c_out = c_out_start; - c_out < c_out_start + filterMultiplier; c_out ++) { - for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { - const int filter_h = h_in + paddingH - h_out * strideH; - for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { - const int filter_w = w_in + paddingW - w_out * strideW; - const int filter_offset = c_out * filterHeight * filterWidth - + filter_h * filterWidth + filter_w; - const int top_diff_offset = ((batch * outputChannels + c_out) * - outputHeight + h_out)* outputWidth + w_out; - value += top_diff[top_diff_offset] * weight_data[filter_offset]; - } + for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier; + c_out++) { + for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { + const int filter_h = h_in + paddingH - h_out * strideH; + for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { + const int filter_w = w_in + paddingW - w_out * strideW; + const int filter_offset = c_out * filterHeight * filterWidth + + filter_h * filterWidth + filter_w; + const int top_diff_offset = + ((batch * outputChannels + c_out) * outputHeight + h_out) * + outputWidth + + w_out; + value += top_diff[top_diff_offset] * weight_data[filter_offset]; } + } } bottom_diff[index] += value; - } + } } // CUDA kernel to compute the depthwise convolution backprop w.r.t filter. template -__global__ -void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, - const T* const top_diff, const T* const inputData, - const int num, const int outputChannels, const int outputHeight, - const int outputWidth, const int inputChannels, const int inputHeight, - const int inputWidth, const int filterMultiplier, const int filterHeight, - const int filterWidth, const int strideH, const int strideW, - const int paddingH, const int paddingW, T* const buffer_data) { - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; +__global__ void ConvolutionDepthwiseFilterBackward(const int num_i, + const int nthreads, + const T* const top_diff, + const T* const inputData, + const int num, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const buffer_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < nthreads) { const int h_out = (index / outputWidth) % outputHeight; const int w_out = index % outputWidth; - const int kh = (index / filterWidth / outputHeight / outputWidth) - % filterHeight; + const int kh = + (index / filterWidth / outputHeight / outputWidth) % filterHeight; const int kw = (index / outputHeight / outputWidth) % filterWidth; const int h_in = -paddingH + h_out * strideH + kh; const int w_in = -paddingW + w_out * strideW + kw; - if ((h_in >= 0) && (h_in < inputHeight) - && (w_in >= 0) && (w_in < inputWidth)) { - const int c_out = index / - (filterHeight * filterWidth * outputHeight * outputWidth); + if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && + (w_in < inputWidth)) { + const int c_out = + index / (filterHeight * filterWidth * outputHeight * outputWidth); const int c_in = c_out / filterMultiplier; const int batch = num_i; - const int top_offset = ((batch * outputChannels + c_out) * - outputHeight + h_out) * outputWidth + w_out; - const 
int bottom_offset = ((batch * inputChannels + c_in) - * inputHeight + h_in) * inputWidth + w_in; + const int top_offset = + ((batch * outputChannels + c_out) * outputHeight + h_out) * + outputWidth + + w_out; + const int bottom_offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth + + w_in; buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset]; } else { buffer_data[index] = 0; @@ -163,170 +199,169 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, } template -class DepthwiseConvFunctor{ +class DepthwiseConvFunctor { public: void operator()(const T* inputData, - const T* filterData, - int batchSize, - int outputChannels, - int outputHeight, - int outputWidth, - int inputChannels, - int inputHeight, - int inputWidth, - int filterMultiplier, - int filterHeight, - int filterWidth, - int strideH, - int strideW, - int paddingH, - int paddingW, - T* outputData){ + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* outputData) { int outputSize = batchSize * outputChannels * outputHeight * outputWidth; - size_t blocks = (outputSize + 1024 -1) / 1024; + size_t blocks = (outputSize + 1024 - 1) / 1024; size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; + size_t blockY = (blocks + 512 - 1) / 512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); - ConvolutionDepthwiseForward - <<< grid, threads, 0, STREAM_DEFAULT >>>( - outputSize, - inputData, - filterData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputChannels, - inputHeight, - inputWidth, - filterMultiplier, - filterHeight, - filterWidth, - strideH, - strideW, - paddingH, - paddingW, - outputData); - } + ConvolutionDepthwiseForward<<>>( + outputSize, + inputData, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + outputData); + } }; template -class DepthwiseConvGradInputFunctor{ +class DepthwiseConvGradInputFunctor { public: void operator()(const T* outputGrad, - const T* filterData, - int batchSize, - int outputChannels, - int outputHeight, - int outputWidth, - int inputChannels, - int inputHeight, - int inputWidth, - int filterMultiplier, - int filterHeight, - int filterWidth, - int strideH, - int strideW, - int paddingH, - int paddingW, - T* inputGrad){ + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* inputGrad) { int inputSize = batchSize * inputChannels * inputHeight * inputWidth; - size_t blocks = (inputSize + 1024 -1) / 1024; + size_t blocks = (inputSize + 1024 - 1) / 1024; size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; + size_t blockY = (blocks + 512 - 1) / 512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); - ConvolutionDepthwiseInputBackward - // NOLINT_NEXT_LINE(whitespace/operators) - <<< grid, threads, 0, STREAM_DEFAULT >>>( - inputSize, - outputGrad, - filterData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputChannels, - inputHeight, - inputWidth, - filterMultiplier, 
- filterHeight, - filterWidth, - strideH, - strideW, - paddingH, - paddingW, - inputGrad); - } + // NOLINT_NEXT_LINE(whitespace/operators) + <<>>(inputSize, + outputGrad, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + inputGrad); + } }; template class DepthwiseConvGradFilterFunctor { public: void operator()(const T* outputGrad, - const T* inputData, - int batchSize, - int outputChannels, - int outputHeight, - int outputWidth, - int inputChannels, - int inputHeight, - int inputWidth, - int filterMultiplier, - int filterHeight, - int filterWidth, - int strideH, - int strideW, - int paddingH, - int paddingW, - T* colData, - T* filterGrad){ - int colDataSize = outputChannels * filterHeight * filterWidth - * outputHeight * outputWidth; + const T* inputData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* colData, + T* filterGrad) { + int colDataSize = outputChannels * filterHeight * filterWidth * + outputHeight * outputWidth; - size_t blocks = (colDataSize + 1024 -1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, - 1, filterGrad, false, true); + size_t blocks = (colDataSize + 1024 - 1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, + 1, + filterGrad, + false, + true); - for (int i = 0; i < batchSize; i++) { - ConvolutionDepthwiseFilterBackward - <<< grid, threads, 0, STREAM_DEFAULT >>>( - i, - colDataSize, - outputGrad, - inputData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputChannels, - inputHeight, - inputWidth, - filterMultiplier, - filterHeight, - filterWidth, - strideH, - strideW, - paddingH, - paddingW, - colData); - int K = outputHeight * outputWidth; - int M = colDataSize / K; + for (int i = 0; i < batchSize; i++) { + ConvolutionDepthwiseFilterBackward< + T><<>>(i, + colDataSize, + outputGrad, + inputData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + colData); + int K = outputHeight * outputWidth; + int M = colDataSize / K; - BaseMatrix colMatrix(M, K, colData, false, true); - filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); - } + BaseMatrix colMatrix(M, K, colData, false, true); + filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); } + } }; #ifdef PADDLE_TYPE_DOUBLE diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu index 15ba854009636d027447d104071163100d5e3f4b..bd98610498b1af003574129118be4684d38e5813 100644 --- a/paddle/function/Im2ColOpGpu.cu +++ b/paddle/function/Im2ColOpGpu.cu @@ -17,16 +17,21 @@ limitations under the License. 
*/ namespace paddle { -template -__global__ -void im2col(const T* data_im, int numOuts, int height, int width, - int blockH, int blockW, - int strideH, int strideW, - int paddingH, int paddingW, - int height_col, int width_col, - T* data_col) { - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; +template +__global__ void im2col(const T* data_im, + int numOuts, + int height, + int width, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int height_col, + int width_col, + T* data_col) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < numOuts) { int w_out = index % width_col; index /= width_col; @@ -39,17 +44,17 @@ void im2col(const T* data_im, int numOuts, int height, int width, data_col += (channel_out * height_col + h_out) * width_col + w_out; for (int i = 0; i < blockH; ++i) { for (int j = 0; j < blockW; ++j) { - int rIdx = int(h_in+i); - int cIdx = int(w_in+j); - if ((rIdx-(int)paddingH) >= (int)height || - (rIdx-(int)paddingH) < 0 || - (cIdx-(int)paddingW) >= (int)width || - (cIdx-(int)paddingW) < 0) { + int rIdx = int(h_in + i); + int cIdx = int(w_in + j); + if ((rIdx - (int)paddingH) >= (int)height || + (rIdx - (int)paddingH) < 0 || + (cIdx - (int)paddingW) >= (int)width || + (cIdx - (int)paddingW) < 0) { *data_col = 0; } else { - rIdx = rIdx + channel_in*height - paddingH; + rIdx = rIdx + channel_in * height - paddingH; cIdx = cIdx - paddingW; - *data_col = data_im[rIdx* width + cIdx]; + *data_col = data_im[rIdx * width + cIdx]; } data_col += height_col * width_col; } @@ -82,60 +87,73 @@ public: int outputWidth = colShape[4]; int numKernels = inputChannels * outputHeight * outputWidth; - int blocks = (numKernels + 1024 -1) / 1024; + int blocks = (numKernels + 1024 - 1) / 1024; int blockX = 512; int blockY = (blocks + 512 - 1) / 512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); - im2col<<< grid, threads, 0, STREAM_DEFAULT >>> - (imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth, - strideHeight, strideWidth, paddingHeight, paddingWidth, - outputHeight, outputWidth, colData); + im2col<<>>(imData, + numKernels, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth, + colData); CHECK_SYNC("Im2ColFunctor GPU failed"); } }; -template -__global__ -void col2im(size_t n, const T* data_col, size_t height, - size_t width, size_t channels, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t height_col, size_t width_col, - T* data_im) { +template +__global__ void col2im(size_t n, + const T* data_col, + size_t height, + size_t width, + size_t channels, + size_t blockH, + size_t blockW, + size_t strideH, + size_t strideW, + size_t paddingH, + size_t paddingW, + size_t height_col, + size_t width_col, + T* data_im) { size_t index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < n) { T val = 0; int w = int(index % width); int h = int((index / width) % height); int c = int(index / (width * height)); if ((w - (int)paddingW) >= 0 && - (w - (int)paddingW) < (width-2 * paddingW) && - (h - (int)paddingH) >= 0 && - (h - paddingH) < (height - 2 * paddingH)) { + (w - (int)paddingW) < (width - 2 * paddingW) && + (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) { // compute the start and end of 
the output int w_col_start = - (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; - int w_col_end = - min((int)(w / (int)strideW + 1), (int)(width_col)); + (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; + int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col)); int h_col_start = - (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1; + (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1; int h_col_end = min(int(h / strideH + 1), int(height_col)); for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { // the col location: [c * width * height + h_out, w_out] - int c_col = int(c * blockH* blockW) + \ - (h - h_col * (int)strideH) * (int)blockW + - (w - w_col * (int)strideW); + int c_col = int(c * blockH * blockW) + + (h - h_col * (int)strideH) * (int)blockW + + (w - w_col * (int)strideW); val += data_col[(c_col * height_col + h_col) * width_col + w_col]; } } h -= paddingH; w -= paddingW; - data_im[c*((width-2*paddingW) * (height-2*paddingH)) + - h*(width-2*paddingW) + w] += val; + data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) + + h * (width - 2 * paddingW) + w] += val; } } } @@ -164,32 +182,32 @@ public: int outputHeight = colShape[3]; int outputWidth = colShape[4]; - size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight) - * (inputWidth + 2*paddingWidth); + size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) * + (inputWidth + 2 * paddingWidth); - size_t blocks = (numKernels + 1024 -1) / 1024; + size_t blocks = (numKernels + 1024 - 1) / 1024; size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; + size_t blockY = (blocks + 512 - 1) / 512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); // To avoid involving atomic operations, we will launch one kernel per // bottom dimension, and then in the kernel add up the top dimensions. 
- col2im<<< grid, threads, 0, STREAM_DEFAULT >>> - (numKernels, - colData, - inputHeight + 2*paddingHeight, - inputWidth + 2*paddingWidth, - inputChannels, - filterHeight, - filterWidth, - strideHeight, - strideWidth, - paddingHeight, - paddingWidth, - outputHeight, - outputWidth, - imData); + col2im<<>>( + numKernels, + colData, + inputHeight + 2 * paddingHeight, + inputWidth + 2 * paddingWidth, + inputChannels, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth, + imData); CHECK_SYNC("Col2ImFunctor GPU failed"); } }; @@ -199,31 +217,35 @@ template class Im2ColFunctor; template class Col2ImFunctor; template class Col2ImFunctor; -template -__global__ -void im2colOCF(const T* imData, T* colData, - int inputChannels, - int inputHeight, int inputWidth, - int filterHeight, int filterWidth, - int strideHeight, int strideWidth, - int paddingHeight, int paddingWidth, - int outputHeight, int outputWidth) { +template +__global__ void im2colOCF(const T* imData, + T* colData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth) { int swId = blockIdx.x; int shId = blockIdx.y; - for (int channelId = threadIdx.z; - channelId < inputChannels; + for (int channelId = threadIdx.z; channelId < inputChannels; channelId += blockDim.z) { for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { int widthOffset = idx + swId * strideWidth - paddingWidth; int heightOffset = idy + shId * strideHeight - paddingHeight; - int imOffset = widthOffset + heightOffset * inputWidth - + channelId * inputHeight * inputWidth; + int imOffset = widthOffset + heightOffset * inputWidth + + channelId * inputHeight * inputWidth; - int colOffset = idx + idy * filterWidth - + channelId * filterHeight * filterWidth - + (shId * outputWidth + swId) - * (inputChannels * filterHeight * filterWidth); + int colOffset = idx + idy * filterWidth + + channelId * filterHeight * filterWidth + + (shId * outputWidth + swId) * + (inputChannels * filterHeight * filterWidth); if (heightOffset >= inputHeight || heightOffset < 0 || widthOffset >= inputWidth || widthOffset < 0) { @@ -279,39 +301,52 @@ public: int blockDimZ = 1024 / blockDimX / blockDimY; dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels)); dim3 grid(outputWidth, outputHeight); - im2colOCF<<< grid, threads, 0, STREAM_DEFAULT >>> - (imData, colData, inputChannels, inputHeight, inputWidth, - filterHeight, filterWidth, strideHeight, strideWidth, - paddingHeight, paddingWidth, outputHeight, outputWidth); + im2colOCF<<>>(imData, + colData, + inputChannels, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth); CHECK_SYNC("Im2ColFunctor GPU failed"); } }; -template -__global__ -void col2imOCF(T* imData, const T* colData, - int inputChannels, - int inputHeight, int inputWidth, - int filterHeight, int filterWidth, - int strideHeight, int strideWidth, - int paddingHeight, int paddingWidth, - int outputHeight, int outputWidth) { +template +__global__ void col2imOCF(T* imData, + const T* colData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + 
int outputHeight, + int outputWidth) { int swId = blockIdx.x; int shId = blockIdx.y; - for (int channelId = threadIdx.z; - channelId < inputChannels; + for (int channelId = threadIdx.z; channelId < inputChannels; channelId += blockDim.z) { for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { int widthOffset = idx + swId * strideWidth - paddingWidth; int heightOffset = idy + shId * strideHeight - paddingHeight; - int imOffset = widthOffset + heightOffset * inputWidth - + channelId * inputHeight * inputWidth; + int imOffset = widthOffset + heightOffset * inputWidth + + channelId * inputHeight * inputWidth; - int colOffset = idx + idy * filterWidth - + channelId * filterHeight * filterWidth - + (shId * outputWidth + swId) - * (inputChannels * filterHeight * filterWidth); + int colOffset = idx + idy * filterWidth + + channelId * filterHeight * filterWidth + + (shId * outputWidth + swId) * + (inputChannels * filterHeight * filterWidth); if (heightOffset >= 0 && heightOffset < inputHeight && widthOffset >= 0 && widthOffset < inputWidth) { @@ -365,10 +400,19 @@ public: int blockDimZ = 1024 / blockDimX / blockDimY; dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels)); dim3 grid(outputWidth, outputHeight); - col2imOCF<<< grid, threads, 0, STREAM_DEFAULT >>> - (imData, colData, inputChannels, inputHeight, inputWidth, - filterHeight, filterWidth, strideHeight, strideWidth, - paddingHeight, paddingWidth, outputHeight, outputWidth); + col2imOCF<<>>(imData, + colData, + inputChannels, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth); CHECK_SYNC("Col2ImFunctor GPU failed"); } }; diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu index dcfcb2325d7dae22e0e0e78fc0bddf061fc0940c..9449b89056b4b1740cb4c3de630348b1b361d61e 100644 --- a/paddle/function/MulOpGpu.cu +++ b/paddle/function/MulOpGpu.cu @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "MulOp.h" +#include "hl_base.h" #include "paddle/math/Matrix.h" #include "paddle/math/SparseMatrix.h" diff --git a/paddle/function/PadOpGpu.cu b/paddle/function/PadOpGpu.cu index 9094f1528433fdcaad3397a991aa8ac6fa04bc01..5b6f4e6832aea4bcfe22e530f5f25ef5815729f1 100644 --- a/paddle/function/PadOpGpu.cu +++ b/paddle/function/PadOpGpu.cu @@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "hl_base.h" #include "PadOp.h" +#include "hl_base.h" namespace paddle { -__global__ void KePad(real* outputs, const real* inputs, - int inC, int inH, int inW, - int padc, int padh, int padw, - int outC, int outH, int outW, int nthreads) { +__global__ void KePad(real* outputs, + const real* inputs, + int inC, + int inH, + int inW, + int padc, + int padh, + int padw, + int outC, + int outH, + int outW, + int nthreads) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < nthreads) { const int w = idx % inW; @@ -50,16 +58,33 @@ void Pad(real* outputs, int outC = inC + cstart + cend; int outH = inH + hstart + hend; int outW = inW + wstart + wend; - KePad<<>> - (outputs, inputs, inC, inH, inW, cstart, hstart, wstart, - outC, outH, outW, nth); + KePad<<>>(outputs, + inputs, + inC, + inH, + inW, + cstart, + hstart, + wstart, + outC, + outH, + outW, + nth); CHECK_SYNC("Pad"); } -__global__ void KePadDiff(real* inGrad, const real* outGrad, - int inC, int inH, int inW, - int padc, int padh, int padw, - int outC, int outH, int outW, int nthreads) { +__global__ void KePadDiff(real* inGrad, + const real* outGrad, + int inC, + int inH, + int inW, + int padc, + int padh, + int padw, + int outC, + int outH, + int outW, + int nthreads) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < nthreads) { const int w = idx % inW; @@ -89,9 +114,18 @@ void PadGrad(real* inGrad, int outC = inC + cstart + cend; int outH = inH + hstart + hend; int outW = inW + wstart + wend; - KePadDiff <<>> - (inGrad, outGrad, inC, inH, inW, cstart, hstart, wstart, - outC, outH, outW, nth); + KePadDiff<<>>(inGrad, + outGrad, + inC, + inH, + inW, + cstart, + hstart, + wstart, + outC, + outH, + outW, + nth); CHECK_SYNC("PadGrad"); } diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu index d9dcc7d59d1e3c222f5a7ce448daa8d7edb6c978..b0cbd9fd1df9a35d6cc1cb5312099d8b45197944 100644 --- a/paddle/function/RowConvOpGpu.cu +++ b/paddle/function/RowConvOpGpu.cu @@ -12,16 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "RowConvOp.h" +#include "hl_base.h" namespace paddle { -template -__global__ void KeRowConv(real* y, const real* x, const real* w, - const int* starts, const int height, const int width, - const int numSeq, const int context) { - +template +__global__ void KeRowConv(real* y, + const real* x, + const real* w, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -30,7 +34,7 @@ __global__ void KeRowConv(real* y, const real* x, const real* w, __shared__ real sw[BLOCK_H][BLOCK_W]; for (int i = tidy; i < context; i += blky) { - sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; + sw[i][tidx] = gidx + tidx < width ? 
w[i * width + gidx + tidx] : 0.0; } __syncthreads(); @@ -56,9 +60,14 @@ __global__ void KeRowConv(real* y, const real* x, const real* w, } } -__global__ void KeRowConv2(real* y, const real* x, const real* w, - const int* starts, const int height, const int width, - const int numSeq, const int context) { +__global__ void KeRowConv2(real* y, + const real* x, + const real* w, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -84,8 +93,6 @@ __global__ void KeRowConv2(real* y, const real* x, const real* w, } } - - template <> void RowConv(GpuMatrix& out, const GpuMatrix& in, @@ -105,21 +112,24 @@ void RowConv(GpuMatrix& out, dim3 dimGrid(DIVUP(width, dimBlock.x), 1); if (contextLength <= 32) { - KeRowConv<32, 32><<>> - (y, x, w, starts, height, width, numSeq, contextLength); + KeRowConv<32, 32><<>>( + y, x, w, starts, height, width, numSeq, contextLength); } else { - KeRowConv2<<>> - (y, x, w, starts, height, width, numSeq, contextLength); + KeRowConv2<<>>( + y, x, w, starts, height, width, numSeq, contextLength); } CHECK_SYNC("RowConv"); } - -template -__global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +template +__global__ void KeRowConvBwWeight(real* dw, + const real* x, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -138,21 +148,21 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, const int start = starts[i]; const int end = starts[i + 1]; const int steps = end - start; - const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H; + const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; for (int j = tidy; j < size; j += BLOCK_H) { int xoff = gidx + tidx; int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? - x[yoff * width + xoff] : 0.0; - sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? - dy[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = + (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy + context - 1] = + (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0; __syncthreads(); if (tidy < (context - 1)) { yoff = yoff - context + 1; - sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? - dy[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy] = + (xoff < width && yoff >= start) ? 
dy[yoff * width + xoff] : 0.0; } __syncthreads(); @@ -179,11 +189,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, } } -template -__global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +template +__global__ void KeRowConvBwWeight2(real* dw, + const real* x, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int gidx = blockIdx.x * blockDim.x; @@ -196,19 +210,21 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, const int end = starts[i + 1]; const int steps = end - start; - const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H; + const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; for (int j = tidy; j < size; j += BLOCK_H) { int xoff = gidx + tidx; int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? - x[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = + (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; __syncthreads(); for (int t = 0; t < context; t++) { - sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && - yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0; + sh_dy[tidx][tidy] = + (xoff < width && (yoff - t) >= start && yoff - t < end) + ? dy[(yoff - t) * width + xoff] + : 0.0; __syncthreads(); real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx]; @@ -222,18 +238,22 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, __syncthreads(); if (tidx == 0 && (gidx + tidy) < width) { - dw[t*width + gidx + tidy] += val; + dw[t * width + gidx + tidy] += val; } } } } } -template -__global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +template +__global__ void KeRowConvBwData(real* dx, + const real* w, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -242,7 +262,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, __shared__ real sw[BLOCK_H][BLOCK_W]; for (int i = tidy; i < context; i += blky) { - sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; + sw[i][tidx] = gidx + tidx < width ? 
w[i * width + gidx + tidx] : 0.0; } __syncthreads(); @@ -266,10 +286,14 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, } } -__global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +__global__ void KeRowConvBwData2(real* dx, + const real* w, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -295,14 +319,13 @@ __global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy, } } - template <> void RowConvGrad(const GpuMatrix& outG, - const GpuMatrix& in, - const GpuMatrix& filter, - GpuMatrix& inG, - GpuMatrix& filterG, - const GpuIVector& seq) { + const GpuMatrix& in, + const GpuMatrix& filter, + GpuMatrix& inG, + GpuMatrix& filterG, + const GpuIVector& seq) { const size_t numSeq = seq.getSize() - 1; const size_t contextLength = filter.getHeight(); const size_t height = in.getHeight(); @@ -318,13 +341,11 @@ void RowConvGrad(const GpuMatrix& outG, dim3 dimGrid(DIVUP(width, dimBlock.x), 1); real* dw = filterG.getData(); if (contextLength <= 32) { - KeRowConvBwWeight<32, 32, 32> - <<>> - (dw, x, dy, starts, height, width, numSeq, contextLength); + KeRowConvBwWeight<32, 32, 32><<>>( + dw, x, dy, starts, height, width, numSeq, contextLength); } else { - KeRowConvBwWeight2<32, 32> - <<>> - (dw, x, dy, starts, height, width, numSeq, contextLength); + KeRowConvBwWeight2<32, 32><<>>( + dw, x, dy, starts, height, width, numSeq, contextLength); } } @@ -333,13 +354,11 @@ void RowConvGrad(const GpuMatrix& outG, dim3 dimBlock2(32, 32); dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1); if (contextLength <= 64) { - KeRowConvBwData<32, 64> - <<>> - (dx, w, dy, starts, height, width, numSeq, contextLength); + KeRowConvBwData<32, 64><<>>( + dx, w, dy, starts, height, width, numSeq, contextLength); } else { - KeRowConvBwData2 - <<>> - (dx, w, dy, starts, height, width, numSeq, contextLength); + KeRowConvBwData2<<>>( + dx, w, dy, starts, height, width, numSeq, contextLength); } } diff --git a/paddle/gserver/layers/GruCompute.cu b/paddle/gserver/layers/GruCompute.cu index d5e547dce347c824f959425551afea66dfd94e5a..b4f5c54b14767586cb7b7e2c86cc069e2063ccfd 100644 --- a/paddle/gserver/layers/GruCompute.cu +++ b/paddle/gserver/layers/GruCompute.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "GruCompute.h" #include "hl_recurrent_apply.cuh" @@ -31,8 +30,10 @@ void GruCompute::forward<1>(hl_gru_value value, int frameSize, int batchSize) { } template <> -void GruCompute::backward<1>(hl_gru_value value, hl_gru_grad grad, - int frameSize, int batchSize) { +void GruCompute::backward<1>(hl_gru_value value, + hl_gru_grad grad, + int frameSize, + int batchSize) { hl_gpu_gru_backward(hppl::backward::gru_stateGrad(), hppl::backward::gru_resetGrad(), value, diff --git a/paddle/gserver/layers/LstmCompute.cu b/paddle/gserver/layers/LstmCompute.cu index f75c0c40ccc833e35f8fe8f21c12b3d3f68d5eb6..d3f59b52a4b3163f47a969d9a08ecd139a099e33 100644 --- a/paddle/gserver/layers/LstmCompute.cu +++ b/paddle/gserver/layers/LstmCompute.cu @@ -12,41 +12,62 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - #include "LstmCompute.h" #include "hl_recurrent_apply.cuh" namespace paddle { template <> -void LstmCompute::forwardBatch<1>(hl_lstm_value value, int frameSize, - int batchSize) { - hl_gpu_lstm_forward(hppl::forward::lstm(), value, frameSize, - batchSize, activeNode_, activeGate_, +void LstmCompute::forwardBatch<1>(hl_lstm_value value, + int frameSize, + int batchSize) { + hl_gpu_lstm_forward(hppl::forward::lstm(), + value, + frameSize, + batchSize, + activeNode_, + activeGate_, activeState_); } template <> -void LstmCompute::backwardBatch<1>(hl_lstm_value value, hl_lstm_grad grad, - int frameSize, int batchSize) { - hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad, - frameSize, batchSize, activeNode_, - activeGate_, activeState_); +void LstmCompute::backwardBatch<1>(hl_lstm_value value, + hl_lstm_grad grad, + int frameSize, + int batchSize) { + hl_gpu_lstm_backward(hppl::backward::lstm(), + value, + grad, + frameSize, + batchSize, + activeNode_, + activeGate_, + activeState_); } template <> void LstmCompute::forwardOneSequence<1>(hl_lstm_value value, int frameSize) { - hl_gpu_lstm_forward(hppl::forward::lstm(), value, - frameSize, /* batchSize */ 1, - activeNode_, activeGate_, activeState_); + hl_gpu_lstm_forward(hppl::forward::lstm(), + value, + frameSize, + /* batchSize */ 1, + activeNode_, + activeGate_, + activeState_); } template <> -void LstmCompute::backwardOneSequence<1>(hl_lstm_value value, hl_lstm_grad grad, +void LstmCompute::backwardOneSequence<1>(hl_lstm_value value, + hl_lstm_grad grad, int frameSize) { - hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad, - frameSize, /* batchSize */ 1, - activeNode_, activeGate_, activeState_); + hl_gpu_lstm_backward(hppl::backward::lstm(), + value, + grad, + frameSize, + /* batchSize */ 1, + activeNode_, + activeGate_, + activeState_); } } // namespace paddle diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu index ba2b47d6cc6961a380b7db2781b4d214dea829db..5435808fb7f70fdf1ac98815f7fe8890fb85527c 100644 --- a/paddle/math/BaseMatrix.cu +++ b/paddle/math/BaseMatrix.cu @@ -12,21 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include #include +#include +#include #include "BaseMatrix.h" -#include "hl_matrix_ops.cuh" -#include "hl_matrix_base.cuh" -#include "hl_matrix_apply.cuh" -#include "SIMDFunctions.h" #include "MathFunctions.h" +#include "SIMDFunctions.h" +#include "hl_matrix_apply.cuh" +#include "hl_matrix_base.cuh" +#include "hl_matrix_ops.cuh" namespace paddle { const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported."; -template +template template int BaseMatrixT::applyUnary(Op op) { MatrixOffset offset(0, 0); @@ -34,9 +34,11 @@ int BaseMatrixT::applyUnary(Op op) { return 0; } -template +template template -int BaseMatrixT::applyUnary(Op op, int numRows, int numCols, +int BaseMatrixT::applyUnary(Op op, + int numRows, + int numCols, MatrixOffset& offset) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; int dimM = numRows; @@ -56,7 +58,7 @@ int BaseMatrixT::applyUnary(Op op, int numRows, int numCols, return 0; } -template +template template int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b) { CHECK(height_ == b.height_ && width_ == b.width_) @@ -67,18 +69,23 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b) { return 0; } -template +template template -int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, - MatrixOffset& offset) { +int BaseMatrixT::applyBinary( + Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) { applyBinary(op, b, numRows, numCols, offset, false_type(), false_type()); return 0; } -template +template template -int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, - MatrixOffset& offset, bAsRowVector, bAsColVector) { +int BaseMatrixT::applyBinary(Op op, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + bAsRowVector, + bAsColVector) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch"; @@ -91,8 +98,8 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, T* A = data_; T* B = b.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); if (!bAsRowVector::value && !bAsColVector::value) { @@ -115,7 +122,7 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, return 0; } -template +template template int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) { CHECK_EQ(height_, b.height_); @@ -129,21 +136,29 @@ int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) { return 0; } -template +template template -int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, - int numRows, int numCols, +int BaseMatrixT::applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, MatrixOffset& offset) { applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type()); return 0; } -template +template template -int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, - int numRows, int numCols, MatrixOffset& offset, - cAsRowVector, cAsColVector) { +int BaseMatrixT::applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + cAsRowVector, + cAsColVector) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; 
CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR; @@ -160,10 +175,10 @@ int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, T* B = b.data_; T* C = c.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); - CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, - offset.cRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); @@ -180,21 +195,21 @@ int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, } if (true == useGpu_) { - hl_gpu_apply_ternary_op - ( + hl_gpu_apply_ternary_op( op, A, B, C, dimM, dimN, lda, ldb, ldc); } else { - hl_cpu_apply_ternary_op - ( + hl_cpu_apply_ternary_op( op, A, B, C, dimM, dimN, lda, ldb, ldc); } return 0; } -template +template template -int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, +int BaseMatrixT::applyQuaternary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, BaseMatrixT& d) { CHECK_EQ(height_, b.height_); CHECK_EQ(width_, b.width_); @@ -209,10 +224,14 @@ int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, return 0; } -template +template template -int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, - BaseMatrixT& d, int numRows, int numCols, +int BaseMatrixT::applyQuaternary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + BaseMatrixT& d, + int numRows, + int numCols, MatrixOffset& offset) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; @@ -234,12 +253,12 @@ int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, T* C = c.data_; T* D = d.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); - CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, - offset.cRow_); - CAL_MATRIX_START_ADDRESS(D, d.height_, d.width_, ldd, offset.dCol_, - offset.dRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); + CAL_MATRIX_START_ADDRESS( + D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); @@ -250,22 +269,29 @@ int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, CHECK_LE(dimM + offset.dRow_, d.height_); CHECK_LE(dimN + offset.dCol_, d.width_); if (true == useGpu_) { - hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, - ldc, ldd); + hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd); } else { - hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, - ldc, ldd); + hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd); } return 0; } -template -template +template -int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, - int numRows, int numCols, MatrixOffset& offset, - aAsRowVector, aAsColVector) { +int BaseMatrixT::aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, + aAsColVector) { CHECK_EQ(useGpu_, 
b.useGpu_); int ld = stride_; @@ -273,10 +299,10 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, T* dst = data_; T* B = b.data_; - CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_, - offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); + CAL_MATRIX_START_ADDRESS( + dst, height_, width_, ld, offset.aCol_, offset.aRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); if (aAsRowVector::value && !aAsColVector::value) { if (useGpu_) { @@ -297,12 +323,21 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, return 0; } -template -template +template -int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, - BaseMatrixT& c, int numRows, int numCols, - MatrixOffset& offset, aAsRowVector, +int BaseMatrixT::aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, aAsColVector) { CHECK_EQ(useGpu_, b.useGpu_); CHECK_EQ(useGpu_, c.useGpu_); @@ -314,28 +349,28 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, T* dst = data_; T* B = b.data_; T* C = c.data_; - CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_, - offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); - CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, - offset.cRow_); + CAL_MATRIX_START_ADDRESS( + dst, height_, width_, ld, offset.aCol_, offset.aRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); if (aAsRowVector::value && !aAsColVector::value) { if (useGpu_) { - hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, - ldb, C, ldc); + hl_gpu_matrix_column_op( + agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); } else { - hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, - ldb, C, ldc); + hl_cpu_matrix_column_op( + agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); } } else if (!aAsRowVector::value && aAsColVector::value) { if (useGpu_) { - hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, - ldb, C, ldc); + hl_gpu_matrix_row_op( + agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); } else { - hl_cpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, - ldb, C, ldc); + hl_cpu_matrix_row_op( + agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); } } else { LOG(FATAL) << "not supported"; @@ -350,15 +385,19 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, */ DEFINE_MATRIX_UNARY_OP(Neg, a = -a); -template -void BaseMatrixT::neg() { applyUnary(unary::Neg()); } +template +void BaseMatrixT::neg() { + applyUnary(unary::Neg()); +} DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a)); -template<> -void BaseMatrixT::exp2() { applyUnary(unary::Exp()); } +template <> +void BaseMatrixT::exp2() { + applyUnary(unary::Exp()); +} DEFINE_MATRIX_UNARY_OP(Log, a = log(a)); -template<> +template <> void BaseMatrixT::log2() { if (useGpu_) { applyUnary(unary::Log()); @@ -368,30 +407,42 @@ void BaseMatrixT::log2() { } DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a)); -template<> -void BaseMatrixT::sqrt2() { applyUnary(unary::Sqrt()); } +template <> +void BaseMatrixT::sqrt2() { + applyUnary(unary::Sqrt()); +} DEFINE_MATRIX_UNARY_OP(Square, a = a * a); -template -void BaseMatrixT::square2() { applyUnary(unary::Square()); } +template +void 
BaseMatrixT::square2() { + applyUnary(unary::Square()); +} DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a); -template -void BaseMatrixT::reciprocal2() { applyUnary(unary::Reciprocal()); } +template +void BaseMatrixT::reciprocal2() { + applyUnary(unary::Reciprocal()); +} DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a); -template -void BaseMatrixT::abs2() { applyUnary(unary::Abs()); } +template +void BaseMatrixT::abs2() { + applyUnary(unary::Abs()); +} DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0)); -template -void BaseMatrixT::sign2() { applyUnary(unary::Sign()); } +template +void BaseMatrixT::sign2() { + applyUnary(unary::Sign()); +} DEFINE_MATRIX_UNARY_OP(Zero, a = 0); -template -void BaseMatrixT::zero() { applyUnary(unary::Zero()); } +template +void BaseMatrixT::zero() { + applyUnary(unary::Zero()); +} -template +template void BaseMatrixT::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { int numRows = height_; int numCols = numColumns; @@ -400,11 +451,13 @@ void BaseMatrixT::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { } DEFINE_MATRIX_UNARY_OP(One, a = 1); -template -void BaseMatrixT::one() { applyUnary(unary::One()); } +template +void BaseMatrixT::one() { + applyUnary(unary::One()); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p)); -template<> +template <> void BaseMatrixT::pow2(real p) { if (useGpu_) { applyUnary(unary::Pow(p)); @@ -414,51 +467,67 @@ void BaseMatrixT::pow2(real p) { } DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p); -template -void BaseMatrixT::subScalar(T p) { applyUnary(unary::SubScalar(p)); } +template +void BaseMatrixT::subScalar(T p) { + applyUnary(unary::SubScalar(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p); -template -void BaseMatrixT::mulScalar(T p) { applyUnary(unary::MulScalar(p)); } +template +void BaseMatrixT::mulScalar(T p) { + applyUnary(unary::MulScalar(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p); -template -void BaseMatrixT::divScalar(T p) { applyUnary(unary::DivScalar(p)); } +template +void BaseMatrixT::divScalar(T p) { + applyUnary(unary::DivScalar(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p); -template -void BaseMatrixT::assign(T p) { applyUnary(unary::Assign(p)); } +template +void BaseMatrixT::assign(T p) { + applyUnary(unary::Assign(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p); -template -void BaseMatrixT::add(T p) { applyUnary(unary::Add(p)); } +template +void BaseMatrixT::add(T p) { + applyUnary(unary::Add(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2); -template -void BaseMatrixT::add(T p1, T p2) { applyUnary(unary::Add2(p1, p2)); } +template +void BaseMatrixT::add(T p1, T p2) { + applyUnary(unary::Add2(p1, p2)); +} -DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER, +DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, + TWO_PARAMETER, a = a < p1 ? p1 : (a > p2 ? p2 : a)); -template -void BaseMatrixT::clip(T p1, T p2) { applyUnary(unary::Clip(p1, p2)); } +template +void BaseMatrixT::clip(T p1, T p2) { + applyUnary(unary::Clip(p1, p2)); +} -DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, TWO_PARAMETER, - a = b < p1 ? 0 : (b > p2 ? 0 : 1)); -template +DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, + TWO_PARAMETER, + a = b < p1 ? 0 : (b > p2 ? 
0 : 1)); +template void BaseMatrixT::clipDerivative(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::ClipDerivative(p1, p2), b); } -DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, ONE_PARAMETER, +DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, + ONE_PARAMETER, a = a > p ? 1.0f : 0.0f); -template +template void BaseMatrixT::biggerThanScalar(T p) { applyUnary(unary::BiggerThanScalar(p)); } -DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, - a = a > p ? a : p); -template +DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? a : p); +template void BaseMatrixT::downClip(T p) { applyUnary(unary::DownClip(p)); } @@ -469,12 +538,12 @@ void BaseMatrixT::downClip(T p) { */ DEFINE_MATRIX_BINARY_OP(Add, a += b); -template +template void BaseMatrixT::add(BaseMatrixT& b) { applyBinary(binary::Add(), b); } -template<> +template <> void BaseMatrixT::add(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Add(), b); @@ -485,7 +554,7 @@ void BaseMatrixT::add(BaseMatrixT& b) { } } -template +template void BaseMatrixT::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { if (columnOffset + b.width_ <= width_) { int numRows = height_; @@ -504,43 +573,53 @@ void BaseMatrixT::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { } } -template +template void BaseMatrixT::addP2P(BaseMatrixT& b) { T* A = data_; T* B = b.data_; int dimM = height_; int dimN = width_; - hl_gpu_apply_binary_op, 0, 0> - (binary::Add(), A, B, dimM, dimN, dimN, dimN); + hl_gpu_apply_binary_op, 0, 0>( + binary::Add(), A, B, dimM, dimN, dimN, dimN); } -template +template void BaseMatrixT::addColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add(), b, numRows, numCols, offset, false_type(), + applyBinary(binary::Add(), + b, + numRows, + numCols, + offset, + false_type(), true_type() /* bAsColVector */); } -template +template void BaseMatrixT::addRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add(), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::Add(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p) { applyBinary(binary::Add1(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p)); -template<> +template <> void BaseMatrixT::pow2(BaseMatrixT& b, real p) { if (useGpu_) { applyBinary(binary::Pow(p), b); @@ -550,36 +629,45 @@ void BaseMatrixT::pow2(BaseMatrixT& b, real p) { } DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::Add2(p1, p2), b); } -template +template void BaseMatrixT::addBias(BaseMatrixT& b, T scale) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add1(scale), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::Add1(scale), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } DEFINE_MATRIX_BINARY_OP(Sub, a -= b); -template -void BaseMatrixT::sub(BaseMatrixT& b) { applyBinary(binary::Sub(), b); } +template +void BaseMatrixT::sub(BaseMatrixT& b) { + applyBinary(binary::Sub(), b); +} DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p); 
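// ---------------------------------------------------------------------------
// [Editorial aside, not part of the patch] The DEFINE_MATRIX_*_OP and
// DEFINE_MATRIX_*_PARAMETER_OP macros used throughout BaseMatrix.cu generate
// small functor types whose operator() evaluates the given element-wise
// expression; applyUnary/applyBinary then map such a functor over every
// element, dispatching to the hl_cpu_apply_*_op / hl_gpu_apply_*_op helpers
// seen elsewhere in this file. Below is a minimal standalone sketch of that
// pattern, using hypothetical names (SubScalarOp, applyUnaryCpuSketch) purely
// for illustration -- it is not the actual macro expansion.
struct SubScalarOp {  // roughly the functor that unary::SubScalar(p) provides
  float p;
  explicit SubScalarOp(float value) : p(value) {}
  void operator()(float& a) const { a -= p; }  // the "a -= p" expression above
};

template <class Op>
void applyUnaryCpuSketch(Op op, float* data, int n) {
  for (int i = 0; i < n; ++i) op(data[i]);  // visit each element exactly once
}
// Usage (hypothetical): float v[4] = {3.f, 3.f, 3.f, 3.f};
// applyUnaryCpuSketch(SubScalarOp(1.f), v, 4);  // every entry becomes 2.f
// ---------------------------------------------------------------------------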
-template +template void BaseMatrixT::sub(BaseMatrixT& b, T p) { applyBinary(binary::Sub1(p), b); } DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f); -template -void BaseMatrixT::relu(BaseMatrixT& b) { applyBinary(binary::Relu(), b); } +template +void BaseMatrixT::relu(BaseMatrixT& b) { + applyBinary(binary::Relu(), b); +} DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f)); -template +template void BaseMatrixT::reluDerivative(BaseMatrixT& b) { applyBinary(binary::ReluDerivative(), b); } @@ -589,7 +677,7 @@ DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0; ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a)))); -template<> +template <> void BaseMatrixT::softrelu(BaseMatrixT& b) { applyBinary(binary::Softrelu(), b); } @@ -599,97 +687,100 @@ DEFINE_MATRIX_BINARY_OP( a *= (1.0 - exp(-1.0 * ((b > THRESHOLD) ? THRESHOLD : ((b < -THRESHOLD) ? (-THRESHOLD) : b))))); -template<> +template <> void BaseMatrixT::softreluDerivative(BaseMatrixT& b) { applyBinary(binary::SoftreluDerivative(), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1; b = b < p2 ? b : p2); -template +template void BaseMatrixT::brelu(BaseMatrixT& b) { - int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. + int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. applyBinary(binary::Brelu(p1, p2), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, + TWO_PARAMETER, a *= (b > p1 && b < p2) ? 1.0 : 0.0); -template +template void BaseMatrixT::breluDerivative(BaseMatrixT& b) { int p1 = 0, p2 = 24; applyBinary(binary::BreluDerivative(p1, p2), b); } DEFINE_MATRIX_BINARY_OP(Square, b = a * a); -template +template void BaseMatrixT::square2(BaseMatrixT& b) { applyBinary(binary::Square(), b); } DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b); -template +template void BaseMatrixT::squareDerivative(BaseMatrixT& b) { applyBinary(binary::SquareDerivative(), b); } -DEFINE_MATRIX_BINARY_OP(Tanh, - T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); -template<> +DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? 
EXP_MAX_INPUT : tmp; + b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); +template <> void BaseMatrixT::tanh(BaseMatrixT& b) { applyBinary(binary::Tanh(), b); } DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b); -template +template void BaseMatrixT::tanhDerivative(BaseMatrixT& b) { applyBinary(binary::TanhDerivative(), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanh, TWO_PARAMETER, - b = p1 * - (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); -template<> +DEFINE_MATRIX_BINARY_PARAMETER_OP( + ScaledTanh, TWO_PARAMETER, b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); +template <> void BaseMatrixT::scaledTanh(BaseMatrixT& b, real p1, real p2) { applyBinary(binary::ScaledTanh(p1, p2), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, + TWO_PARAMETER, a *= p2 * (p1 - b * b)); -template +template void BaseMatrixT::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::ScaledTanhDerivative(p1 * p1, p2 / p1), b); } DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a); -template +template void BaseMatrixT::reciprocal2(BaseMatrixT& b) { applyBinary(binary::Reciprocal(), b); } DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b); -template +template void BaseMatrixT::reciprocalDerivative(BaseMatrixT& b) { applyBinary(binary::ReciprocalDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a); -template -void BaseMatrixT::abs2(BaseMatrixT& b) { applyBinary(binary::Abs(), b); } +template +void BaseMatrixT::abs2(BaseMatrixT& b) { + applyBinary(binary::Abs(), b); +} DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0); -template +template void BaseMatrixT::absDerivative(BaseMatrixT& b) { applyBinary(binary::AbsDerivative(), b); } -DEFINE_MATRIX_BINARY_OP( - Sigmoid, const T THRESHOLD_MIN = -40.0; const T THRESHOLD_MAX = 13.0; - T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN - : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a); - b = 1.0f / (1.0f + exp(-tmp))); -template<> +DEFINE_MATRIX_BINARY_OP(Sigmoid, const T THRESHOLD_MIN = -40.0; + const T THRESHOLD_MAX = 13.0; + T tmp = (a < THRESHOLD_MIN) + ? THRESHOLD_MIN + : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a); + b = 1.0f / (1.0f + exp(-tmp))); +template <> void BaseMatrixT::sigmoid(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Sigmoid(), b); @@ -723,31 +814,31 @@ void BaseMatrixT::sigmoid(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b)); -template +template void BaseMatrixT::sigmoidDerivative(BaseMatrixT& b) { applyBinary(binary::SigmoidDerivative(), b); } DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b); -template +template void BaseMatrixT::expDerivative(BaseMatrixT& b) { applyBinary(binary::ExpDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 
1.0f : -1.0f); -template +template void BaseMatrixT::sign2(BaseMatrixT& b) { applyBinary(binary::Sign(), b); } DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b)); -template<> +template <> void BaseMatrixT::exp2(BaseMatrixT& b) { applyBinary(binary::Exp(), b); } DEFINE_MATRIX_BINARY_OP(Log, a = log(b)); -template<> +template <> void BaseMatrixT::log2(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Log(), b); @@ -757,13 +848,13 @@ void BaseMatrixT::log2(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b)); -template<> +template <> void BaseMatrixT::sqrt2(BaseMatrixT& b) { applyBinary(binary::Sqrt(), b); } DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b)); -template<> +template <> void BaseMatrixT::invSqrt(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::InvSqrt(), b); @@ -775,37 +866,37 @@ void BaseMatrixT::invSqrt(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p)); -template +template void BaseMatrixT::isEqualTo(BaseMatrixT& b, T value) { applyBinary(binary::IsEqual(value), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p); -template +template void BaseMatrixT::addScalar(BaseMatrixT& b, T p) { applyBinary(binary::AddScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p); -template +template void BaseMatrixT::subScalar(BaseMatrixT& b, T p) { applyBinary(binary::SubScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p); -template +template void BaseMatrixT::mulScalar(BaseMatrixT& b, T p) { applyBinary(binary::MulScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p); -template +template void BaseMatrixT::divScalar(BaseMatrixT& b, T p) { applyBinary(binary::DivScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b); -template +template void BaseMatrixT::scalarDiv(BaseMatrixT& b, T p) { applyBinary(binary::ScalarDiv(p), b); } @@ -817,20 +908,20 @@ void BaseMatrixT::scalarDiv(BaseMatrixT& b, T p) { DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy, a = -c * log(b) - (1 - c) * log(1 - b)); -template<> +template <> void BaseMatrixT::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::SoftCrossEntropy(), b, c); } DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b))); -template +template void BaseMatrixT::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::SoftCrossEntropyBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy, a = c > 0.5 ? -log(b) : -log(1.0 - b)); -template<> +template <> void BaseMatrixT::binaryLabelCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { if (useGpu_) { @@ -858,70 +949,73 @@ void BaseMatrixT::binaryLabelCrossEntropy(BaseMatrixT& b, DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp, a += c > 0.5 ? 
-1.0 / b : 1.0 / (1.0 - b)); -template +template void BaseMatrixT::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::BinaryCrossEntropyBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(Add, a = b + c); -template +template void BaseMatrixT::add(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Add(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { applyTernary(ternary::Add1(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c); -template +template void BaseMatrixT::sub(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Sub(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c); -template +template void BaseMatrixT::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { applyTernary(ternary::Sub1(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c); -template +template void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Add2(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, + THREE_PARAMETER, a = p1 * a + p2 * b + p3 * c); -template +template void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { applyTernary(ternary::Add3(p1, p2, p3), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, + THREE_PARAMETER, c = p2 * c - p1 * (b + p3 * a); a = a + c); -template +template void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad BaseMatrixT& c, // mom - T p1, // learningRate, - T p2, // momentum, - T p3) { // decayRate + T p1, // learningRate, + T p2, // momentum, + T p3) { // decayRate applyTernary(ternary::SgdUpdate(p1, p2, p3), b, c); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, + THREE_PARAMETER, c = p2 * c - p1 * d * (b + p3 * a); a += c); -template +template void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad, BaseMatrixT& c, // mom, BaseMatrixT& d, // lr, - T p1, // learningRate, - T p2, // momentum, - T p3) { // decayRate + T p1, // learningRate, + T p2, // momentum, + T p3) { // decayRate applyQuaternary(quaternary::SgdUpdate(p1, p2, p3), b, c, d); } @@ -929,19 +1023,22 @@ DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b; a = (a > lambda) ? (a - lambda) : (a < -lambda) ? (a + lambda) : 0); -template +template void BaseMatrixT::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) { applyBinary(binary::ApplyL1(learningRate * decayRate), lr); } -template<> +template <> void BaseMatrixT::applyL1(BaseMatrixT& lr, real learningRate, real decayRate) { if (useGpu_) { applyBinary(binary::ApplyL1(learningRate * decayRate), lr); } else { - simd::decayL1(this->data_, this->data_, lr.data_, learningRate * decayRate, + simd::decayL1(this->data_, + this->data_, + lr.data_, + learningRate * decayRate, height_ * width_); } } @@ -950,24 +1047,25 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p; a = (a > lambda) ? (a - lambda) : (a < -lambda) ? 
(a + lambda) : 0); -template +template void BaseMatrixT::applyL1(T learningRate, T decayRate) { applyUnary(unary::ApplyL1(learningRate * decayRate)); } -template<> +template <> void BaseMatrixT::applyL1(real learningRate, real decayRate) { if (useGpu_) { applyUnary(unary::ApplyL1(learningRate * decayRate)); } else { - simd::decayL1(this->data_, this->data_, learningRate * decayRate, - height_ * width_); + simd::decayL1( + this->data_, this->data_, learningRate * decayRate, height_ * width_); } } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, ONE_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, + ONE_PARAMETER, a *= (1.0f / (1.0f + p * b))); -template +template void BaseMatrixT::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { if (useGpu_) { applyBinary(binary::ApplyL2(learningRate * decayRate), lr); @@ -980,32 +1078,33 @@ void BaseMatrixT::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { } } -template +template void BaseMatrixT::applyL2(T learningRate, T decayRate) { BaseMatrixT::mulScalar(1.0f / (1.0f + learningRate * decayRate)); } DEFINE_MATRIX_BINARY_OP(DotMul, a *= b); -template +template void BaseMatrixT::dotMul(BaseMatrixT& b) { applyBinary(binary::DotMul(), b); } DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c); -template +template void BaseMatrixT::dotMul(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotMul(), b, c); } DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c); -template +template void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotDiv(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, + TWO_PARAMETER, a = (b + p1) / (c + p2)); -template +template void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotDiv2P(p1, p2), b, c); } @@ -1015,7 +1114,7 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c; ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a); a = log(1 + exp(a)) - a * d); -template<> +template <> void BaseMatrixT::rankLoss(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1026,8 +1125,9 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c; a = (a > THRESHOLD) ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a); - a = exp(a); a = (a / (1 + a) - d)); -template<> + a = exp(a); + a = (a / (1 + a) - d)); +template <> void BaseMatrixT::rankLossBp(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1040,7 +1140,7 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0; ? -THRESHOLD : b; a = log(1 + exp(x)) - c * x); -template<> +template <> void BaseMatrixT::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::LogisticRegressionLoss(), b, c); } @@ -1050,22 +1150,23 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0; T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) ? -THRESHOLD : b; - x = exp(x); a = x / (1 + x) - c); -template<> + x = exp(x); + a = x / (1 + x) - c); +template <> void BaseMatrixT::logisticRegressionLossBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::LogisticRegressionLossBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f); -template +template void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::BiggerThan(), b, c); } DEFINE_MATRIX_QUATERNARY_OP( BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 
1.0f : 0.0f); -template +template void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1073,25 +1174,34 @@ void BaseMatrixT::biggerThan(BaseMatrixT& b, } DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c); -template +template void BaseMatrixT::max2(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Max(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, ONE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, + ONE_PARAMETER, c += ((a > p) == (b > p)) ? 0.0f : 1.0f); -template -void BaseMatrixT::binaryClassificationError2(size_t destCol, BaseMatrixT& b, - BaseMatrixT& c, T p) { +template +void BaseMatrixT::binaryClassificationError2(size_t destCol, + BaseMatrixT& b, + BaseMatrixT& c, + T p) { CHECK(!useGpu_) << "do not support gpu"; MatrixOffset offset(0, 0, 0, 0, destCol, 0); int numRows = b.height_; int numCols = b.width_; - b.applyTernary(ternary::BinaryClassificationError(p), c, *this, numRows, - numCols, offset, false_type(), true_type() /*cAsColVector*/); + b.applyTernary(ternary::BinaryClassificationError(p), + c, + *this, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } -template<> +template <> void BaseMatrixT::binaryClassificationError(size_t destCol, BaseMatrixT& b, BaseMatrixT& c, @@ -1099,127 +1209,148 @@ void BaseMatrixT::binaryClassificationError(size_t destCol, MatrixOffset offset(destCol, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; - aggregate(aggregate::sum(), base::binary::classificationError(p), - base::binary::add(), b, c, numRows, numCols, offset, false_type(), + aggregate(aggregate::sum(), + base::binary::classificationError(p), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + false_type(), true_type() /*aAsColVector*/); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, + THREE_PARAMETER, a = p1 * b + p2 * c + p3 * d); -template -void BaseMatrixT::add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, - T p2, T p3) { +template +void BaseMatrixT::add3( + BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) { applyQuaternary(quaternary::Add3(p1, p2, p3), b, c, d); } DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c); -template +template void BaseMatrixT::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotMulSquare(), b, c); } DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c); -template +template void BaseMatrixT::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotSquareSquare(), b, c); } DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b); -template +template void BaseMatrixT::dotMulSquare(BaseMatrixT& b) { applyBinary(binary::DotMulSquare(), b); } DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b); -template +template void BaseMatrixT::dotSquareMul(BaseMatrixT& b) { applyBinary(binary::DotSquareMul(), b); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, + THREE_PARAMETER, T tmp = p1 * b + p2 * c + p3 * d; a += tmp * tmp); -template -void BaseMatrixT::addSquareSum(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, - T p1, T p2, T p3) { +template +void BaseMatrixT::addSquareSum( + BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) { applyQuaternary(quaternary::AddSquareSum(p1, p2, p3), b, c, d); } DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b); -template +template void 
BaseMatrixT::addSquare(BaseMatrixT& b, T p) { applyBinary(binary::AddSquare(p), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, + TWO_PARAMETER, a = p1 * a + p2 * b * b); -template +template void BaseMatrixT::decayAddSquare(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::DecayAddSquare(p1, p2), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, + TWO_PARAMETER, a = p1 * a + p2 * b * b * c * c); -template -void BaseMatrixT::decayAddSquareMul(BaseMatrixT& b, BaseMatrixT& c, T p1, +template +void BaseMatrixT::decayAddSquareMul(BaseMatrixT& b, + BaseMatrixT& c, + T p1, T p2) { applyTernary(ternary::DecayAddSquareMul(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, + THREE_PARAMETER, a = 1 / (p1 * b + p2 * c + p3)); -template -void BaseMatrixT::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, - T p3) { +template +void BaseMatrixT::reciprocalSum( + BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { applyTernary(ternary::ReciprocalSum(p1, p2, p3), b, c); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, + TWO_PARAMETER, a = 1 / (p1 * b + p2)); -template +template void BaseMatrixT::reciprocal2(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::Reciprocal2(p1, p2), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, + TWO_PARAMETER, T tmp = p1 * b + p2 * c; a *= tmp * tmp); -template -void BaseMatrixT::dotMulSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, +template +void BaseMatrixT::dotMulSquareSum(BaseMatrixT& b, + BaseMatrixT& c, + T p1, T p2) { applyTernary(ternary::DotMulSquareSum(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, + TWO_PARAMETER, T tmp = p1 * b + p2 * c; a = tmp * tmp); -template +template void BaseMatrixT::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotSquareSum(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, + TWO_PARAMETER, a *= p1 * b + p2 * c); -template +template void BaseMatrixT::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotMulSum(p1, p2), b, c); } DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0); -template +template void BaseMatrixT::copyAndClear(BaseMatrixT& b) { applyBinary(binary::CopyAndClear(), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, + TWO_PARAMETER, a = p1 * a + p2 * b * c); -template +template void BaseMatrixT::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::AddDotMul(p1, p2), b, c); } DEFINE_MATRIX_BINARY_OP(Assign, a = b;); -template +template void BaseMatrixT::assign(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Assign(), b); @@ -1230,7 +1361,7 @@ void BaseMatrixT::assign(BaseMatrixT& b) { } } -template +template void BaseMatrixT::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { if (columnOffset + b.width_ <= width_) { int numRows = height_; @@ -1250,24 +1381,31 @@ void BaseMatrixT::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { } DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp); -template +template void 
BaseMatrixT::deepSwap(BaseMatrixT& b) { - applyBinary(binary::DeepSwap(), b); + applyBinary(binary::DeepSwap(), b); } -template<> +template <> void BaseMatrixT::rowDotMul(size_t destCol, BaseMatrixT& b, BaseMatrixT& c) { int numRows = b.height_; int numCols = b.width_; MatrixOffset offset(destCol, 0, 0, 0, 0, 0); - aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, - numRows, numCols, offset, false_type(), + aggregate(aggregate::sum(), + base::binary::mul(), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + false_type(), true_type() /*aAsColVector*/); } -template +template void BaseMatrixT::rowDotMul2(size_t destCol, BaseMatrixT& b, BaseMatrixT& c) { @@ -1290,17 +1428,24 @@ void BaseMatrixT::rowDotMul2(size_t destCol, } } -template<> +template <> void BaseMatrixT::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; - aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, - numRows, numCols, offset, true_type() /*aAsRowVector*/, + aggregate(aggregate::sum(), + base::binary::mul(), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, false_type()); } -template +template void BaseMatrixT::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1321,16 +1466,22 @@ void BaseMatrixT::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { } DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c); -template +template void BaseMatrixT::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, - true_type() /*cAsRowVector*/, false_type()); + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + true_type() /*cAsRowVector*/, + false_type()); } -template +template void BaseMatrixT::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1350,16 +1501,22 @@ void BaseMatrixT::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::DotMul(), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::DotMul(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } -template +template void BaseMatrixT::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1379,52 +1536,82 @@ void BaseMatrixT::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, cRow); int numRows = height_; int numCols = width_; - applyTernary(ternary::DotMul(), b, c, numRows, numCols, offset, - true_type() /* cAsRowVector */, false_type() /* cAsColVector */); + applyTernary(ternary::DotMul(), + b, + c, + numRows, + numCols, + offset, + true_type() /* cAsRowVector */, + false_type() /* cAsColVector */); } -template +template void BaseMatrixT::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, cRow); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, - 
true_type() /* cAsRowVector */, false_type() /* cAsColVector */); + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + true_type() /* cAsRowVector */, + false_type() /* cAsColVector */); } -template +template void BaseMatrixT::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c); -template +template void BaseMatrixT::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::RowAdd(p), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::RowAdd(p), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c)); -template<> +template <> void BaseMatrixT::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { if (useGpu_) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::RowPow(), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::RowPow(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } else { size_t height = this->height_; size_t width = this->width_; @@ -1441,44 +1628,64 @@ void BaseMatrixT::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::mulRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotMul(), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::DotMul(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b); -template +template void BaseMatrixT::divRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotDiv(), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::DotDiv(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } -template +template void BaseMatrixT::mulColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotMul(), b, numRows, numCols, offset, - false_type(), true_type() /* bAsColVector */); + applyBinary(binary::DotMul(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /* bAsColVector */); } -template +template void BaseMatrixT::divColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotDiv(), b, numRows, numCols, offset, - false_type(), true_type() /* bAsColVector */); + applyBinary(binary::DotDiv(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /* bAsColVector */); } -template<> +template <> template int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1486,13 +1693,20 @@ int BaseMatrixT::applyRow(Agg agg, 
BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(height_, numRows); CHECK_EQ(width_, 1UL); - aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, - numCols, offset, false_type(), true_type() /*aAsColVector*/); + aggregate(agg, + base::unary::identity(), + base::binary::second(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); return 0; } -template<> +template <> template int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1500,16 +1714,25 @@ int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(height_, numRows); CHECK_EQ(width_, 1UL); - aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, - false_type(), true_type() /*aAsColVector*/); + aggregate(agg, + base::unary::identity(), + sv, + b, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); return 0; } -template<> +template <> template -int BaseMatrixT::applyRow( - Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { +int BaseMatrixT::applyRow(Agg agg, + real scaleDest, + real scaleAgg, + BaseMatrixT& b) { if (scaleDest != 0) { applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b); } else { @@ -1521,10 +1744,10 @@ int BaseMatrixT::applyRow( return 0; } -template<> +template <> template -int BaseMatrixT::applyRow(Agg agg, Op op, Saver sv, - BaseMatrixT& b, BaseMatrixT& c) { +int BaseMatrixT::applyRow( + Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); size_t numRows = b.height_; size_t numCols = b.width_; @@ -1532,16 +1755,27 @@ int BaseMatrixT::applyRow(Agg agg, Op op, Saver sv, CHECK_EQ(width_, 1UL); CHECK_EQ(c.height_, numRows); CHECK_EQ(c.width_, numCols); - aggregate(agg, op, sv, - b, c, numRows, numCols, offset, - false_type(), true_type() /*aAsColVector*/); + aggregate(agg, + op, + sv, + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); return 0; } -template<> +template <> template -int BaseMatrixT::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, - BaseMatrixT& b, BaseMatrixT& c) { +int BaseMatrixT::applyRow(Agg agg, + Op op, + real scaleDest, + real scaleAgg, + BaseMatrixT& b, + BaseMatrixT& c) { if (scaleDest != 0) { applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c); } else { @@ -1553,7 +1787,7 @@ int BaseMatrixT::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, return 0; } -template<> +template <> template int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1561,13 +1795,20 @@ int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(width_, numCols); CHECK_EQ(height_, 1UL); - aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, - numCols, offset, true_type() /*aAsRowVector*/, false_type()); + aggregate(agg, + base::unary::identity(), + base::binary::second(), + b, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, + false_type()); return 0; } -template<> +template <> template int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1575,16 +1816,25 @@ int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(width_, numCols); CHECK_EQ(height_, 1UL); - aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, - true_type() /*aAsRowVector*/, false_type()); + aggregate(agg, + 
base::unary::identity(), + sv, + b, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, + false_type()); return 0; } -template<> +template <> template -int BaseMatrixT::applyCol( - Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { +int BaseMatrixT::applyCol(Agg agg, + real scaleDest, + real scaleAgg, + BaseMatrixT& b) { if (scaleDest != 0) { applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b); } else { @@ -1596,48 +1846,51 @@ int BaseMatrixT::applyCol( return 0; } -template<> +template <> void BaseMatrixT::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) { applyRow(aggregate::sum(), scaleDest, scaleSum, b); } -template<> +template <> void BaseMatrixT::maxRows(BaseMatrixT& b) { applyRow(aggregate::max(), b); } -template<> +template <> void BaseMatrixT::minRows(BaseMatrixT& b) { applyRow(aggregate::min(), b); } -template<> +template <> void BaseMatrixT::maxCols(BaseMatrixT& b) { applyCol(aggregate::max(), b); } -template<> +template <> void BaseMatrixT::minCols(BaseMatrixT& b) { applyCol(aggregate::min(), b); } -template<> +template <> void BaseMatrixT::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) { applyCol(aggregate::sum(), scaleDest, scaleSum, b); } -template<> -void BaseMatrixT::sumOfSquaredDiffs( - BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { - applyRow(aggregate::sum(), base::binary::squaredDiff(), - scaleDest, scaleSum, b, c); +template <> +void BaseMatrixT::sumOfSquaredDiffs(BaseMatrixT& b, + BaseMatrixT& c, + real scaleSum, + real scaleDest) { + applyRow( + aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c); } -template<> -void BaseMatrixT::sumOfProducts( - BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { - applyRow(aggregate::sum(), base::binary::mul(), - scaleDest, scaleSum, b, c); +template <> +void BaseMatrixT::sumOfProducts(BaseMatrixT& b, + BaseMatrixT& c, + real scaleSum, + real scaleDest) { + applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c); } template class BaseMatrixT; diff --git a/paddle/math/TrainingAlgorithmOp.cu b/paddle/math/TrainingAlgorithmOp.cu index 72ff077270382d52bfcd340cc64d9abf49d1705d..fc746b85339de596d5ddc5811a8164094c13f63f 100644 --- a/paddle/math/TrainingAlgorithmOp.cu +++ b/paddle/math/TrainingAlgorithmOp.cu @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
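For reference, the scaled row reduction that sumRows routes through applyRow(aggregate::sum(), scaleDest, scaleSum, b) boils down to the loop below. This is a minimal standalone sketch (function and variable names are illustrative, not part of the patch); the applyRow path above additionally special-cases scaleDest == 0 so the destination is overwritten rather than read and scaled.

#include <vector>

// dest holds one entry per row of src:
// dest[i] = scaleDest * dest[i] + scaleSum * sum_j src[i][j]
void sumRowsSketch(std::vector<float>& dest,
                   const std::vector<std::vector<float>>& src,
                   float scaleSum,
                   float scaleDest) {
  for (size_t i = 0; i < src.size(); ++i) {
    float rowSum = 0.0f;
    for (float v : src[i]) rowSum += v;
    dest[i] = scaleDest * dest[i] + scaleSum * rowSum;
  }
}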
*/ -#include "paddle/utils/Logging.h" #include "BaseMatrix.h" #include "TrainingAlgorithmOp.h" +#include "paddle/utils/Logging.h" #if __cplusplus > 199711L @@ -32,10 +32,10 @@ void sparseMomentumApply(BaseMatrix& value, real tau, real learningRate) { auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad); - auto expr2 = momV.lazyAssign( - momV + (tau * alpha * gamma * learningRate) * grad); - auto expr3 = value.lazyAssign( - (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV); + auto expr2 = + momV.lazyAssign(momV + (tau * alpha * gamma * learningRate) * grad); + auto expr3 = value.lazyAssign((tau / beta + (real)1 / alpha) * momU + + ((real)1 / beta) * momV); AssignEvaluate(expr1, expr2, expr3); } @@ -52,12 +52,12 @@ void adadeltaApply(BaseMatrix& value, real momentum, real decayRate) { auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square()); - auto expr2 = lr.lazyAssign( - ((accum_update + epsilon) / (accum + epsilon)).sqrt()); - auto expr3 = accum_update.lazyAssign( - rou * accum_update + ((real)1 - rou) * (grad * lr).square()); - auto expr4 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr2 = + lr.lazyAssign(((accum_update + epsilon) / (accum + epsilon)).sqrt()); + auto expr3 = accum_update.lazyAssign(rou * accum_update + + ((real)1 - rou) * (grad * lr).square()); + auto expr4 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr5 = value.lazyAssign(value + mom); AssignEvaluate(expr1, expr2, expr3, expr4, expr5); @@ -74,10 +74,10 @@ void adagradApply(BaseMatrix& value, real momentum, real decayRate) { auto expr1 = accum.lazyAssign(accum + grad.square()); - auto expr2 = lr.lazyAssign( - (accum_buffer + accum + epsilon).sqrt().reciprocal()); - auto expr3 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr2 = + lr.lazyAssign((accum_buffer + accum + epsilon).sqrt().reciprocal()); + auto expr3 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr4 = value.lazyAssign(value + mom); AssignEvaluate(expr1, expr2, expr3, expr4); @@ -98,8 +98,8 @@ void rmspropApply(BaseMatrix& value, bool firstTime) { auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad); auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal()); - auto expr4 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr4 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr5 = value.lazyAssign(value + mom); if (firstTime) { @@ -107,8 +107,8 @@ void rmspropApply(BaseMatrix& value, AssignEvaluate(expr1, expr2, expr3, expr4, expr5); } else { - auto expr1 = g.lazyAssign( - accumulatedRou * g + ((real)1 - rou) * grad.square()); + auto expr1 = + g.lazyAssign(accumulatedRou * g + ((real)1 - rou) * grad.square()); AssignEvaluate(expr1, expr2, expr3, expr4, expr5); } @@ -127,8 +127,8 @@ void decayedAdagradApply(BaseMatrix& value, real decayRate, bool firstTime) { auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal()); - auto expr3 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr3 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr4 = value.lazyAssign(value + mom); if (firstTime) { @@ -136,8 +136,8 @@ void decayedAdagradApply(BaseMatrix& value, AssignEvaluate(expr1, expr2, expr3, expr4); } else { - auto 
expr1 = accum.lazyAssign( - accumulatedRou * accum + ((real)1 - rou) * grad.square()); + auto expr1 = accum.lazyAssign(accumulatedRou * accum + + ((real)1 - rou) * grad.square()); AssignEvaluate(expr1, expr2, expr3, expr4); } @@ -153,13 +153,12 @@ void adamApply(BaseMatrix& value, real beta2_power, real epsilon, real learningRate) { - real alpha = learningRate * - std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); + real alpha = + learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad); auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square()); - auto expr3 = value.lazyAssign( - value - (mom * alpha) / (v.sqrt() + epsilon)); + auto expr3 = value.lazyAssign(value - (mom * alpha) / (v.sqrt() + epsilon)); AssignEvaluate(expr1, expr2, expr3); } @@ -173,10 +172,10 @@ void adamaxApply(BaseMatrix& value, int64_t step, real alpha) { auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad); - auto expr2 = u.lazyAssign( - (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs())); + auto expr2 = + u.lazyAssign((beta2 * u > grad.abs()).condition(beta2 * u, grad.abs())); auto expr3 = value.lazyAssign( - value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u)); + value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u)); AssignEvaluate(expr1, expr2, expr3); } @@ -322,8 +321,8 @@ void adamApply(BaseMatrix& value, real beta2_power, real epsilon, real learningRate) { - real alpha = learningRate * - std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); + real alpha = + learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; mom = beta1 * mom + ((real)1 - beta1) * grad; @@ -331,7 +330,7 @@ void adamApply(BaseMatrix& value, // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2 v = beta2 * v + ((real)1 - beta2) * grad.square(); - value -= (mom * alpha) / (v.sqrt() + epsilon); + value -= (mom * alpha) / (v.sqrt() + epsilon); } void adamaxApply(BaseMatrix& value, diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu index 40e38434fa328bba8be6e1b8e509023d615899c1..31b693afa8bd50f77a8efb67769e6215dd755bd3 100644 --- a/paddle/math/tests/test_Tensor.cu +++ b/paddle/math/tests/test_Tensor.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
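For reference, the element-wise Adam step that adamApply encodes (in both its lazyAssign and plain BaseMatrix forms above) works out to the per-element recurrence below. This is a minimal sketch with illustrative names; beta1_power and beta2_power stand for beta1^t and beta2^t, exactly as the parameters of the same names above.

#include <cmath>

// One Adam step for a single parameter element.
void adamStepSketch(float& value,
                    float& mom,
                    float& v,
                    float grad,
                    float beta1,
                    float beta2,
                    float beta1_power,  // beta1^t
                    float beta2_power,  // beta2^t
                    float epsilon,
                    float learningRate) {
  const float alpha =
      learningRate * std::sqrt(1.0f - beta2_power) / (1.0f - beta1_power);
  mom = beta1 * mom + (1.0f - beta1) * grad;          // first moment
  v = beta2 * v + (1.0f - beta2) * grad * grad;       // second moment
  value -= (mom * alpha) / (std::sqrt(v) + epsilon);  // bias-corrected update
}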
*/ #include -#include "paddle/math/Matrix.h" #include "TensorCheck.h" +#include "paddle/math/Matrix.h" using paddle::Matrix; using paddle::CpuMatrix; @@ -26,25 +26,25 @@ using paddle::GpuIVector; using autotest::TensorCheckEqual; using autotest::TensorCheckErr; -#define INIT_UNARY(A1, A2) \ - Tensor A1(height, width); \ - Tensor A2(height, width); \ - A1.randomizeUniform(); \ - A2.copyFrom(A1) -#define INIT_BINARY(A1, A2, B) \ - INIT_UNARY(A1, A2); \ - Tensor B(height, width); \ - B.randomizeUniform() -#define INIT_TERNARY(A1, A2, B, C) \ - INIT_BINARY(A1, A2, B); \ - Tensor C(height, width); \ - C.randomizeUniform() -#define INIT_QUATERNARY(A1, A2, B, C, D) \ - INIT_TERNARY(A1, A2, B, C); \ - Tensor D(height, width); \ - D.randomizeUniform() - -template +#define INIT_UNARY(A1, A2) \ + Tensor A1(height, width); \ + Tensor A2(height, width); \ + A1.randomizeUniform(); \ + A2.copyFrom(A1) +#define INIT_BINARY(A1, A2, B) \ + INIT_UNARY(A1, A2); \ + Tensor B(height, width); \ + B.randomizeUniform() +#define INIT_TERNARY(A1, A2, B, C) \ + INIT_BINARY(A1, A2, B); \ + Tensor C(height, width); \ + C.randomizeUniform() +#define INIT_QUATERNARY(A1, A2, B, C, D) \ + INIT_TERNARY(A1, A2, B, C); \ + Tensor D(height, width); \ + D.randomizeUniform() + +template struct TestUnaryMatrix { typedef std::function UnaryFunc; @@ -59,7 +59,7 @@ struct TestUnaryMatrix { } }; -template +template struct TestBinaryMatrix { typedef std::function BinaryFunc; @@ -74,10 +74,10 @@ struct TestBinaryMatrix { } }; -template +template struct TestTernaryMatrix { - typedef std::function TernaryFunc; + typedef std::function + TernaryFunc; explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) { for (auto height : {1, 11, 73, 128, 200, 330}) { @@ -90,10 +90,11 @@ struct TestTernaryMatrix { } }; -template +template struct TestQuaternaryMatrix { typedef std::function QuaternaryFunc; + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)> + QuaternaryFunc; explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) { for (auto height : {1, 11, 73, 128, 200, 330}) { @@ -106,7 +107,7 @@ struct TestQuaternaryMatrix { } }; -template +template struct TestUnaryVectorT { typedef std::function UnaryFunc; @@ -142,11 +143,11 @@ void SetTensorValue(Matrix& matrix, real value) { } } -template +template void testTensorAddScalar(Tensor& A1, Tensor& A2) { real p1 = 2.5; real p2 = 3.0; - A1.add(p1); // a += p + A1.add(p1); // a += p A2 += p1; TensorCheckEqual(A1, A2); @@ -155,7 +156,7 @@ void testTensorAddScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorSubScalar(Tensor& A1, Tensor& A2) { real p = 2.5; A1.subScalar(p); // a -= p @@ -163,7 +164,7 @@ void testTensorSubScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorMulScalar(Tensor& A1, Tensor& A2) { real p = 2.5; A1.mulScalar(p); // a *= p @@ -177,7 +178,7 @@ void testTensorMulScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorDivScalar(Tensor& A1, Tensor& A2) { real p = 2.5; A1.divScalar(p); // a /= p @@ -185,44 +186,44 @@ void testTensorDivScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorNeg(Tensor& A1, Tensor& A2) { A1.neg(); // a = -a A2 = -A2; TensorCheckEqual(A1, A2); } -template +template void testTensorAbs(Tensor& A1, Tensor& A2) { A1.abs2(); // a = a > 0 ? 
a : -a A2 = A2.abs(); TensorCheckEqual(A1, A2); } -template +template void testTensorSquare(Tensor& A1, Tensor& A2) { A1.square2(); // a = a * a A2 = A2.square(); TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocal(Tensor& A1, Tensor& A2) { A1.reciprocal2(); // a = 1.0f / a A2 = A2.reciprocal(); TensorCheckEqual(A1, A2); } -template +template void testTensorSign(Tensor& A1, Tensor& A2) { A1.sign2(); // a = (a > 0) - (a < 0) A2 = A2.sign(); TensorCheckEqual(A1, A2); } -template +template void testTensorAssign(Tensor& A1, Tensor& A2) { - A1.assign(1.5); // a = p + A1.assign(1.5); // a = p A2 = A2.constant(1.5); TensorCheckEqual(A1, A2); @@ -235,7 +236,7 @@ void testTensorAssign(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testUnaryBaseOp(Tensor& A1, Tensor& A2) { testTensorAddScalar(A1, A2); testTensorSubScalar(A1, A2); @@ -249,9 +250,9 @@ void testUnaryBaseOp(Tensor& A1, Tensor& A2) { testTensorAssign(A1, A2); } -template +template void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) { - A1.add(2); // a += p + A1.add(2); // a += p A2 += 2; TensorCheckEqual(A1, A2); @@ -266,46 +267,46 @@ void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) { TEST(Unary, BaseOp) { TestUnaryMatrix testCpuMatrix(testUnaryBaseOp); TestUnaryVectorT testCpuVector(testUnaryBaseOp); - TestUnaryVectorT - testCpuIVector(testUnaryBaseOpInt); + TestUnaryVectorT testCpuIVector( + testUnaryBaseOpInt); #ifndef PADDLE_ONLY_CPU TestUnaryMatrix testGpuMatrix(testUnaryBaseOp); TestUnaryVectorT testGpuVector(testUnaryBaseOp); - TestUnaryVectorT - testGpuIVector(testUnaryBaseOpInt); + TestUnaryVectorT testGpuIVector( + testUnaryBaseOpInt); #endif } -template +template void testTensorExp(Tensor& A1, Tensor& A2) { A1.exp2(); // a = exp(a) A2 = A2.exp(); TensorCheckErr(A1, A2); } -template +template void testTensorLog(Tensor& A1, Tensor& A2) { A1.log2(); // a = log(a) A2 = A2.log(); TensorCheckErr(A1, A2); } -template +template void testTensorSqrt(Tensor& A1, Tensor& A2) { A1.sqrt2(); // a = sqrt(a) A2 = A2.sqrt(); TensorCheckErr(A1, A2); } -template +template void testTensorPow(Tensor& A1, Tensor& A2) { A1.pow2(3.2); // a = pow(a, p) A2 = A2.pow(3.2); TensorCheckErr(A1, A2); } -template +template void testUnayrMathOp(Tensor& A1, Tensor& A2) { testTensorExp(A1, A2); testTensorLog(A1, A2); @@ -321,7 +322,7 @@ TEST(Unary, MathOp) { #endif } -template +template void testTensorClip(Tensor& A1, Tensor& A2) { real p1 = 0.003f; real p2 = 0.877f; @@ -331,7 +332,7 @@ void testTensorClip(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) { real p = 0.5f; A1.biggerThanScalar(p); // a = a > p ? 
1.0f : 0.0f @@ -339,7 +340,7 @@ void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorapplyL1(Tensor& A1, Tensor& A2) { /** * T lambda = p; @@ -351,14 +352,15 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2) { real learningRate = 0.7f; real decayRate = 0.6f; A1.applyL1(learningRate, decayRate); - A2 = (A2 > (learningRate * decayRate)).condition( - (A2 - (learningRate * decayRate)), - (A2 < -(learningRate * decayRate)).condition( - (A2 + (learningRate * decayRate)), (real)0.0)); + A2 = (A2 > (learningRate * decayRate)) + .condition( + (A2 - (learningRate * decayRate)), + (A2 < -(learningRate * decayRate)) + .condition((A2 + (learningRate * decayRate)), (real)0.0)); TensorCheckEqual(A1, A2); } -template +template void testUnayrCompareOp(Tensor& A1, Tensor& A2) { testTensorClip(A1, A2); testTensorBiggerThanScalar(A1, A2); @@ -377,7 +379,7 @@ TEST(Unary, CompareOp) { #endif } -template +template void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) { real p1 = 2.5; real p2 = 3.2; @@ -406,7 +408,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) { real p = 2.5; A1.sub(B); // a -= b @@ -422,7 +424,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) { real p = 2.5; A1.mulScalar(B, p); // a = b * p @@ -442,7 +444,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) { real p = 2.5; A1.divScalar(B, p); // a = b / p @@ -454,28 +456,28 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) { A1.assign(B); // a = b A2 = B; TensorCheckEqual(A1, A2); } -template +template void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) { - B.square2(A1); // b = a * a + B.square2(A1); // b = a * a A2 = B.square(); TensorCheckEqual(A1, A2); } -template +template void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.squareDerivative(B); // a *= 2.0 * b A2 = A2 * (real)2.0 * B; TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) { B.reciprocal2(A1); // b = 1.0f / a A2 = B.reciprocal(); @@ -490,33 +492,33 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) { real learningRate = 0.7f; real decayRate = 1.2f; A1.applyL2(B, learningRate, decayRate); // a *= (1.0f / (1.0f + p * b)) - A2 *= (B.constant(1.0f) + - B.constant(learningRate * decayRate) * B).reciprocal(); + A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B) + .reciprocal(); TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.reciprocalDerivative(B); // a *= -b * b A2 *= (-B) * B; TensorCheckEqual(A1, A2); } -template +template void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) { B.sign2(A1); // b = a > 0.0f ? 1.0f : -1.0f A2 = B.sign(); TensorCheckEqual(A1, A2); } -template +template void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) { B.abs2(A1); // b = a > 0.0f ? 
a : -a A2 = B.abs(); TensorCheckEqual(A1, A2); } -template +template void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) { testTensorAdd(A1, A2, B); testTensorSub(A1, A2, B); @@ -539,7 +541,7 @@ TEST(Binary, BaseOp) { #endif } -template +template void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) { // a = exp(b) A1.exp2(B); @@ -547,14 +549,14 @@ void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.expDerivative(B); // a *= b A2 *= B; TensorCheckEqual(A1, A2); } -template +template void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) { // a = log(b) A1.log2(B); @@ -562,7 +564,7 @@ void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) { // a = sqrt(b) A1.sqrt2(B); @@ -570,7 +572,7 @@ void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) { // a = 1.0f / sqrt(b) A1.invSqrt(B); @@ -578,14 +580,14 @@ void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) { A1.pow2(B, 2.5f); // a = pow(b, p) A2 = B.pow(2.5f); TensorCheckErr(A1, A2); } -template +template void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) { /* * const T THRESHOLD = 40.0; @@ -597,12 +599,14 @@ void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) { real THRESHOLD = 40.0; A2 = (B.constant(1.0f) + - (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)).exp()).log(); + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)) + .exp()) + .log(); TensorCheckErr(A1, A2); } -template +template void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { /* * const T THRESHOLD = 40.0; @@ -612,14 +616,16 @@ void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { */ A1.softreluDerivative(B); real THRESHOLD = 40.0; - A2 = A2 * (B.constant(1.0f) - - (B.constant(-1.0f) * - (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))).exp()); + A2 = A2 * + (B.constant(1.0f) - + (B.constant(-1.0f) * + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))) + .exp()); TensorCheckErr(A1, A2); } -template +template void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) { /* const T THRESHOLD_MIN = -40.0; @@ -632,46 +638,47 @@ void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) { const real THRESHOLD_MIN = -40.0; const real THRESHOLD_MAX = 13.0; - auto tmp = (B < THRESHOLD_MIN).condition( - THRESHOLD_MIN, (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B)); + auto tmp = (B < THRESHOLD_MIN) + .condition(THRESHOLD_MIN, + (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B)); A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal(); TensorCheckErr(A1, A2); } -template +template void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.sigmoidDerivative(B); // a *= b * (1 - b) A2 *= B * (B.constant(1.0f) - B); TensorCheckEqual(A1, A2); } -template +template void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) { B.tanh(A1); // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0 A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f; TensorCheckErr(A1, A2); } -template +template void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { 
A1.tanhDerivative(B); // a *= 1 - b * b A2 *= B.constant(1.0f) - B * B; TensorCheckEqual(A1, A2); } -template +template void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) { real p1 = 2.5; real p2 = 3.1; // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0) B.scaledTanh(A1, p1, p2); A2 = B.constant(p1) * - (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) - - (real)1.0); + (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) - + (real)1.0); TensorCheckErr(A1, A2); } -template +template void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { real p1 = 2.5; real p2 = 3.1; @@ -681,7 +688,7 @@ void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) { testTensorTanhDerivative(A1, A2, B); testTensorScaledTanhDerivative(A1, A2, B); @@ -708,21 +715,21 @@ TEST(Binary, MathOp) { #endif } -template +template void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) { B.relu(A1); // b = a > 0.0f ? a : 0.0f A2 = (B > (real)0.0f).condition(B, (real)0.0f); TensorCheckEqual(A1, A2); } -template +template void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.reluDerivative(B); // a *= (b > 0.0f ? 1.0f : 0.0f) A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0); TensorCheckEqual(A1, A2); } -template +template void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) { /* * b = a > p1 ? a : p1 @@ -736,7 +743,7 @@ void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { SetTensorValue(B, 32.0f); /* @@ -748,15 +755,15 @@ void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.absDerivative(B); // a = (b > 0) ? a : (b < 0) ? 
-a : 0 - A2 = (B > (real)0.0f).condition(A2, - (B < (real)0.0f).condition(-A2, (real)0.0f)); + A2 = (B > (real)0.0f) + .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f)); TensorCheckEqual(A1, A2); } -template +template void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) { real p = 0.613; SetTensorValue(B, p); @@ -765,7 +772,7 @@ void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) { /** * T lambda = p * b; @@ -778,12 +785,13 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) { real decayRate = 0.6f; A1.applyL1(B, learningRate, decayRate); auto lambda = B.constant(learningRate * decayRate) * B; - A2 = (A2 > lambda).condition( - (A2 - lambda), (A2 < -lambda).condition((A2 + lambda), (real)0.0f)); + A2 = (A2 > lambda) + .condition((A2 - lambda), + (A2 < -lambda).condition((A2 + lambda), (real)0.0f)); TensorCheckEqual(A1, A2); } -template +template void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) { B.subScalar(0.5f); SetTensorValue(B, 0.0f); @@ -807,7 +815,7 @@ TEST(Binary, CompareOp) { #endif } -template +template void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.add(B, C); // a = b + c A2 = B + C; @@ -833,7 +841,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.sub(B, C); // a = b - c A2 = B - C; @@ -846,7 +854,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.dotMul(B, C); // a = b * c A2 = B * C; @@ -892,7 +900,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.dotDiv(B, C); // a = (b == 0.0) ? 0.0 : b / c A2 = (B == (real)0.0).condition((real)0.0, B / C); @@ -905,7 +913,7 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { real p1 = 1.5; real p2 = 2.5; @@ -915,14 +923,14 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.softCrossEntropy(B, C); // a = -c * log(b) - (1 - c) * log(1 - b) A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log(); TensorCheckErr(A1, A2); } -template +template void testTensorSoftCrossEntropyBp(Tensor& A1, Tensor& A2, Tensor& B, @@ -932,7 +940,7 @@ void testTensorSoftCrossEntropyBp(Tensor& A1, TensorCheckEqual(A1, A2); } -template +template void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { testTensorAdd(A1, A2, B, C); testTensorSub(A1, A2, B, C); @@ -952,30 +960,30 @@ TEST(Ternary, BaseOp) { #endif } -template +template void testTensorBinaryLabelCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.binaryLabelCrossEntropy(B, C); // a = c > 0.5 ? 
-log(b) : -log(1.0 - b) - A2 = (C > (real)0.5).condition( - -(B.log()), -((B.constant(1.0f) - B).log())); + A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log())); TensorCheckErr(A1, A2); } -template +template void testTensorBinaryLabelCrossEntropyBp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b) A1.binaryLabelCrossEntropyBp(B, C); - A2 += (C > (real)0.5).condition( - (B.constant(-1.0f) / B), (B.constant(1.0f) - B).reciprocal()); + A2 += (C > (real)0.5) + .condition((B.constant(-1.0f) / B), + (B.constant(1.0f) - B).reciprocal()); TensorCheckErr(A1, A2); } -template +template void testTensorLogisticRegressionLoss(Tensor& A1, Tensor& A2, Tensor& B, @@ -991,13 +999,14 @@ void testTensorLogisticRegressionLoss(Tensor& A1, */ A1.logisticRegressionLoss(B, C); real THRESHOLD = 40.0; - auto tmp = (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); + auto tmp = + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp; TensorCheckErr(A1, A2); } -template +template void testTensorLogisticRegressionLossBp(Tensor& A1, Tensor& A2, Tensor& B, @@ -1013,28 +1022,29 @@ void testTensorLogisticRegressionLossBp(Tensor& A1, */ A1.logisticRegressionLossBp(B, C); real THRESHOLD = 40.0; - auto tmp = (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); + auto tmp = + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); auto tmp2 = tmp.exp(); A2 = tmp2 / (C.constant(1.0) + tmp2) - C; TensorCheckErr(A1, A2); } -template +template void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.biggerThan(B, C); // a = (b > c) ? 1.0f : 0.0f A2 = (B > C).condition((real)1.0f, (real)0.0f); TensorCheckEqual(A1, A2); } -template +template void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.max2(B, C); // a = (b > c) ? b : c A2 = (B > C).condition(B, C); TensorCheckEqual(A1, A2); } -template +template void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C); testTensorBinaryLabelCrossEntropy(A1, A2, B, C); @@ -1053,12 +1063,9 @@ TEST(Ternary, CompareOp) { #endif } -template -void testQuaternaryAdd(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testQuaternaryAdd( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f); // a = p1 * b + p2 * c + p3 * d // A2 = B * 1.5f + C * 2.5f + D * 3.5f; // TensorCheckEqual(A1, A2); @@ -1084,25 +1091,19 @@ TEST(Quaternary, BaseOp) { #endif } -template -void testTensorBiggerThan(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testTensorBiggerThan( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 
1.0f : 0.0f); A1.biggerThan(B, C, D); - A2 = ((B > C && D > (real)0.5) - || (B < C && D < (real)0.5)).condition((real)1.0, (real)0.0); + A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5)) + .condition((real)1.0, (real)0.0); TensorCheckEqual(A1, A2); } -template -void testTensorRankLoss(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testTensorRankLoss( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { /** * const T THRESHOLD = 40.0; a = b - c; * a = (a > THRESHOLD) @@ -1114,19 +1115,17 @@ void testTensorRankLoss(Tensor& A1, real THRESHOLD = 40.0; auto tmp = B - C; - auto tmp2 = (tmp > THRESHOLD).condition( - THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); + auto tmp2 = + (tmp > THRESHOLD) + .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D; TensorCheckErr(A1, A2); } -template -void testTensorRankLossBp(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testTensorRankLossBp( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { /** * const T THRESHOLD = 40.0; a = b - c; * a = (a > THRESHOLD) @@ -1137,20 +1136,18 @@ void testTensorRankLossBp(Tensor& A1, A1.rankLossBp(B, C, D); real THRESHOLD = 40.0; auto tmp = B - C; - auto tmp2 = (tmp > THRESHOLD).condition( - THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); + auto tmp2 = + (tmp > THRESHOLD) + .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); auto tmp3 = tmp2.exp(); A2 = tmp3 / (D.constant(1.0f) + tmp3) - D; TensorCheckErr(A1, A2); } -template -void testQuaternaryCompareOp(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testQuaternaryCompareOp( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { testTensorBiggerThan(A1, A2, B, C, D); testTensorRankLoss(A1, A2, B, C, D); testTensorRankLossBp(A1, A2, B, C, D); diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu index 786d863a533b58ea9856300aaa0cd8f5a10a4dd9..92afab4ff7f5ff4acc219c5ac783733340c5726a 100644 --- a/paddle/math/tests/test_lazyAssign.cu +++ b/paddle/math/tests/test_lazyAssign.cu @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
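For reference, the clipped pairwise rank loss exercised by testTensorRankLoss and its gradient in testTensorRankLossBp reduce to the per-element functions below. A minimal sketch with illustrative names; the 40.0 threshold mirrors the THRESHOLD used above and exists only to keep exp() from overflowing.

#include <algorithm>
#include <cmath>

// Rank loss for scores b, c with label d (d close to 1 when b should outrank c).
float rankLossSketch(float b, float c, float d) {
  const float THRESHOLD = 40.0f;
  float x = std::min(std::max(b - c, -THRESHOLD), THRESHOLD);  // clip for numeric safety
  return std::log(1.0f + std::exp(x)) - x * d;
}

// Gradient of the loss with respect to b: sigmoid(b - c) - d.
float rankLossBpSketch(float b, float c, float d) {
  const float THRESHOLD = 40.0f;
  float x = std::min(std::max(b - c, -THRESHOLD), THRESHOLD);
  float e = std::exp(x);
  return e / (1.0f + e) - d;
}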
*/ #include +#include "PerfUtils.h" +#include "TensorCheck.h" #include "paddle/math/Matrix.h" #include "paddle/math/TensorAssign.h" -#include "TensorCheck.h" -#include "PerfUtils.h" using paddle::BaseMatrix; using paddle::CpuMatrix; @@ -27,14 +27,28 @@ using autotest::TensorCheckErr; typedef std::function testMatrixFunc; void testMatrixCase(testMatrixFunc matrixFunc) { for (auto height : {1}) { - for (auto width : {1, 32, 64, 128, 512, 1024, 4096, 32768, 65536, 131072, - 262144, 524288, 1048576, 2097152, 4194304, 8388608}) { + for (auto width : {1, + 32, + 64, + 128, + 512, + 1024, + 4096, + 32768, + 65536, + 131072, + 262144, + 524288, + 1048576, + 2097152, + 4194304, + 8388608}) { matrixFunc(height, width); } } } -template +template void testLazyAssign(int height, int width) { Tensor A1(height, width); Tensor A2(height, width); @@ -49,40 +63,39 @@ void testLazyAssign(int height, int width) { EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;); - EXPRESSION_PERFORMANCE( - auto expr1 = A2.lazyAssign(B + C); - auto expr2 = A2.lazyAssign(A2 * D); - AssignEvaluate(expr1, expr2);); + EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C); + auto expr2 = A2.lazyAssign(A2 * D); + AssignEvaluate(expr1, expr2);); TensorCheckErr(A1, A2); } -TEST(lazyAssign, CPU) { - testMatrixCase(testLazyAssign); -} +TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign); } #ifndef PADDLE_ONLY_CPU -TEST(lazyAssign, GPU) { - testMatrixCase(testLazyAssign); -} +TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign); } #endif -template -void sgdUpdateTensor(Tensor& A, Tensor& B, Tensor& C, Tensor& D, - real p1, real p2, real p3) { +template +void sgdUpdateTensor( + Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) { C = C * p2 - D * (B + A * p3) * p1; A += C; } -void sgdUpdateLazyAssign(BaseMatrix& A, BaseMatrix& B, - BaseMatrix& C, BaseMatrix& D, - real p1, real p2, real p3) { +void sgdUpdateLazyAssign(BaseMatrix& A, + BaseMatrix& B, + BaseMatrix& C, + BaseMatrix& D, + real p1, + real p2, + real p3) { auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1); auto expr2 = A.lazyAssign(A + C); AssignEvaluate(expr1, expr2); } -template +template void testSgdUpdate(int height, int width) { Tensor A1(height, width); Tensor A2(height, width); @@ -113,16 +126,13 @@ void testSgdUpdate(int height, int width) { * a = a + c; */ // BaseMatrix API - EXPRESSION_PERFORMANCE( - A1.sgdUpdate(B, C1, D, p1, p2, p3);); + EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3);); // Tensor expression - EXPRESSION_PERFORMANCE( - sgdUpdateTensor(A2, B, C2, D, p1, p2, p3)); + EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3)); // lazyAssign - EXPRESSION_PERFORMANCE( - sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3)); + EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3)); TensorCheckErr(A1, A2); TensorCheckErr(A1, A3); @@ -130,12 +140,8 @@ void testSgdUpdate(int height, int width) { TensorCheckErr(C1, C3); } -TEST(sgdUpdate, CPU) { - testMatrixCase(testSgdUpdate); -} +TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate); } #ifndef PADDLE_ONLY_CPU -TEST(sgdUpdate, GPU) { - testMatrixCase(testSgdUpdate); -} +TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate); } #endif diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu index 8c652213f2e4c0e0ea1a31987fcb37c86374cd2a..a7527ac2919580c0936316abd887a840f5793901 100644 --- a/paddle/operators/softmax_op.cu +++ b/paddle/operators/softmax_op.cu @@ -3,4 +3,5 @@ #include "paddle/operators/softmax_op.h" 
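For reference, the momentum step that testSgdUpdate checks in three equivalent forms (BaseMatrix::sgdUpdate, the sgdUpdateTensor expression, and the lazyAssign variant) comes down to the per-element update below. A minimal sketch with illustrative names; lr corresponds to the per-element learning-rate matrix D, and p1/p2/p3 are the learning rate, momentum and decay rate as above.

// One momentum-SGD step per element.
void sgdUpdateSketch(
    float& value, float grad, float& mom, float lr, float p1, float p2, float p3) {
  mom = mom * p2 - lr * (grad + value * p3) * p1;  // momentum with weight decay
  value += mom;                                    // apply the accumulated step
}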
REGISTER_OP_GPU_KERNEL(softmax, ops::SoftmaxKernel); -REGISTER_OP_GPU_KERNEL(softmax_grad, ops::SoftmaxGradKernel); +REGISTER_OP_GPU_KERNEL(softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto index f189b21e86a50d70d317b5e43aa2d6e05af5e774..0bb215d92b4301030a630e43d98c9cab9bc173fe 100644 Binary files a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto and b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto differ diff --git a/proto/DataConfig.proto b/proto/DataConfig.proto index e895c184d9f95dba1449e6467a2566712837600b..0cb5d7afbb3e1cb4abe45c0ed677e09b27b870fa 100644 --- a/proto/DataConfig.proto +++ b/proto/DataConfig.proto @@ -15,14 +15,13 @@ syntax = "proto2"; package paddle; - message FileGroupConf { - optional uint32 queue_capacity = 1 [default = 1]; + optional uint32 queue_capacity = 1 [ default = 1 ]; // how many files to load for a load file thread - optional int32 load_file_count = 2 [default = 1]; + optional int32 load_file_count = 2 [ default = 1 ]; // how many threads to load files // Setting to be 5~10 is appropriate when loading files by hadoop vfs - optional int32 load_thread_num = 3 [default = 1]; + optional int32 load_thread_num = 3 [ default = 1 ]; }; message DataConfig { @@ -32,26 +31,28 @@ message DataConfig { // name of a text file which contains a list of file names at each line optional string files = 3; - optional int32 feat_dim = 4;//feature dimension of one frame - repeated int32 slot_dims = 5;//feature slot dims - optional int32 context_len = 6;//max neibour frame numbers - optional uint64 buffer_capacity = 7;//the number of samples + optional int32 feat_dim = 4; // feature dimension of one frame + repeated int32 slot_dims = 5; // feature slot dims + optional int32 context_len = 6; // max neibour frame numbers + optional uint64 buffer_capacity = 7; // the number of samples - //part of data used in training - //if not -1, part of train data is used in training - optional int64 train_sample_num = 8 [default = -1]; + // part of data used in training + // if not -1, part of train data is used in training + optional int64 train_sample_num = 8 [ default = -1 ]; - //The number of documents processed once - optional int32 file_load_num = 9 [default = -1]; - optional bool async_load_data = 12 [default = false]; + // The number of documents processed once + optional int32 file_load_num = 9 [ default = -1 ]; + optional bool async_load_data = 12 [ default = false ]; /// Note the field number 10, 11 and 13 have been deprecated. - optional bool for_test = 14 [default = false]; // whether this data is for test + optional bool for_test = 14 + [ default = false ]; // whether this data is for test optional FileGroupConf file_group_conf = 15; repeated int32 float_slot_dims = 16; /// Note the field number 17, 18 and 19 have been deprecated. - // a list of values which will be used to create additional one dimensional float + // a list of values which will be used to create additional one dimensional + // float // values slots. These one dimensional slots can be used as the weight input // for cost layers. // Currently this is only supported by ProtoDataProvider. @@ -65,21 +66,21 @@ message DataConfig { // for MultiDataProvider repeated DataConfig sub_data_configs = 24; // sub dataproviders - /* - * the ratio of each sub dataproviders: - * e.g. 
sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100, - * then each mini-batch is combined by 10 instance from A and 90 instances - * from B. - */ + /* + * the ratio of each sub dataproviders: + * e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100, + * then each mini-batch is combined by 10 instance from A and 90 instances + * from B. + */ optional int32 data_ratio = 25; /* * if one of the sub dataproviders is running out of data, then * (1) it is "main data", then finish current pass. * (2) it is not "main data", then reset it, and try getNextBatch again. */ - optional bool is_main_data = 26 [default = true]; + optional bool is_main_data = 26 [ default = true ]; - // the usage ratio of instances. Setting to 1.0 means the use of all instances. - optional double usage_ratio = 27 [default = 1.0]; + // the usage ratio of instances. Setting to 1.0 means the use of all + // instances. + optional double usage_ratio = 27 [ default = 1.0 ]; }; - diff --git a/proto/DataFormat.proto b/proto/DataFormat.proto index 19b1499b0281a1b92028cc8944c27ee4d56b8dd2..7d963bc29f7c6b9895323b0d57ba4ee4cb4387d0 100644 --- a/proto/DataFormat.proto +++ b/proto/DataFormat.proto @@ -17,27 +17,32 @@ package paddle; /* If values is not empty and ids is empty, this is a dense vector. - If values is not empty and ids is not empty, this is a sparse vector. The position of each value + If values is not empty and ids is not empty, this is a sparse vector. The + position of each value is specified by ids. - If values is empty and ids is not empty, this is a sparse vector whose non-zero values are 1. + If values is empty and ids is not empty, this is a sparse vector whose non-zero + values are 1. The position of each 1 is specified by ids. */ message VectorSlot { - repeated float values = 1 [packed = true]; - repeated uint32 ids = 2 [packed = true]; + repeated float values = 1 [ packed = true ]; + repeated uint32 ids = 2 [ packed = true ]; /* For multidimensional data, for example "image width height depth" */ - repeated uint32 dims = 3 [packed = true]; - repeated string strs = 4; + repeated uint32 dims = 3 [ packed = true ]; + repeated string strs = 4; }; /* - SubseqSlot use to record whether VectorSlot or any other slot in future has subseq. - If not all VectorSlot have subseq, we only store the one who has subseq, and use *slot_id* to record it. - One vector_slots has one sequence, and it may have N subseq, thus the number of *lens* will be N too. + SubseqSlot use to record whether VectorSlot or any other slot in future has + subseq. + If not all VectorSlot have subseq, we only store the one who has subseq, and + use *slot_id* to record it. + One vector_slots has one sequence, and it may have N subseq, thus the number of + *lens* will be N too. */ message SubseqSlot { - required uint32 slot_id = 1; //the id of slot who has subseq - repeated uint32 lens = 2; // lengths of sub-sequence in the slot + required uint32 slot_id = 1; // the id of slot who has subseq + repeated uint32 lens = 2; // lengths of sub-sequence in the slot }; message SlotDef { @@ -45,13 +50,14 @@ message SlotDef { VECTOR_DENSE = 0; VECTOR_SPARSE_NON_VALUE = 1; VECTOR_SPARSE_VALUE = 2; - INDEX = 3; // This can be used as label, or word id, etc. + INDEX = 3; // This can be used as label, or word id, etc. VAR_MDIM_DENSE = 4; VAR_MDIM_INDEX = 5; STRING = 6; } required SlotType type = 1; - required uint32 dim = 2; // For INDEX slots, this means the maximal index plus 1. 
+ required uint32 dim = + 2; // For INDEX slots, this means the maximal index plus 1. }; message DataHeader { @@ -60,11 +66,11 @@ message DataHeader { }; message DataSample { - optional bool is_beginning = 1 [default = true]; // is the beginning of a sequence + optional bool is_beginning = 1 + [ default = true ]; // is the beginning of a sequence repeated VectorSlot vector_slots = 2; - repeated uint32 id_slots = 3 [packed = true]; + repeated uint32 id_slots = 3 [ packed = true ]; /* use ids of VectorSlot */ repeated VectorSlot var_id_slots = 4; repeated SubseqSlot subseq_slots = 5; }; - diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index b50b73c7e169f3e8ae75322d9a0a3cad5072a9c7..4f3d5bf3f6cb96c97285f40e3a3d100c2af47ad5 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -21,7 +21,6 @@ package paddle; * Various structs for the configuration of a neural network */ - message ExternalConfig { repeated string layer_names = 1; repeated string input_layer_names = 2; @@ -68,7 +67,7 @@ message ConvConfig { required uint32 img_size = 8; // caffe mode for output size coherence - required bool caffe_mode = 9 [default = true]; + required bool caffe_mode = 9 [ default = true ]; // if filter_size_y is set , this convolutional layer will use // filters of size filter_size * filter_size_y pixels. @@ -99,7 +98,7 @@ message PoolConfig { optional uint32 start = 4; // Defines the stride size between successive pooling squares. - required uint32 stride = 5 [default = 1]; + required uint32 stride = 5 [ default = 1 ]; // The size of output feature map. required uint32 output_x = 6; @@ -109,7 +108,7 @@ message PoolConfig { // padding = 4, instructs the net to implicitly // pad the images with a 4-pixel border of zeros. - optional uint32 padding = 8 [default = 0]; + optional uint32 padding = 8 [ default = 0 ]; // if not set, use size_x optional uint32 size_y = 9; @@ -194,9 +193,7 @@ message MaxOutConfig { required uint32 groups = 2; } -message RowConvConfig { - required uint32 context_length = 1; -} +message RowConvConfig { required uint32 context_length = 1; } message SliceConfig { required uint32 start = 1; @@ -212,14 +209,14 @@ message ProjectionConfig { // For ShiftProjection optional int32 context_start = 5; optional int32 context_length = 6; - optional bool trainable_padding = 7 [default = false]; + optional bool trainable_padding = 7 [ default = false ]; // For convolution optional ConvConfig conv_conf = 8; optional int32 num_filters = 9; // For IdentityOffsetProjection - optional uint64 offset = 11 [default = 0]; + optional uint64 offset = 11 [ default = 0 ]; // For pool optional PoolConfig pool_conf = 12; @@ -236,7 +233,7 @@ message OperatorConfig { required uint64 output_size = 4; // For DotMulOperator - optional double dotmul_scale = 5 [default = 1.0]; + optional double dotmul_scale = 5 [ default = 1.0 ]; // For ConvOperator optional ConvConfig conv_conf = 6; @@ -282,8 +279,8 @@ message MultiBoxLossConfig { required float neg_overlap = 4; required uint32 background_id = 5; required uint32 input_num = 6; - optional uint32 height = 7 [default = 1]; - optional uint32 width = 8 [default = 1]; + optional uint32 height = 7 [ default = 1 ]; + optional uint32 width = 8 [ default = 1 ]; } message DetectionOutputConfig { @@ -294,8 +291,8 @@ message DetectionOutputConfig { required uint32 input_num = 5; required uint32 keep_top_k = 6; required float confidence_threshold = 7; - optional uint32 height = 8 [default = 1]; - optional uint32 width = 9 [default = 1]; + optional uint32 
height = 8 [ default = 1 ]; + optional uint32 width = 9 [ default = 1 ]; } message ClipConfig { @@ -331,7 +328,7 @@ message LayerConfig { required string name = 1; required string type = 2; optional uint64 size = 3; - //optional ActivationConfig activation = 4; + // optional ActivationConfig activation = 4; optional string active_type = 4; repeated LayerInputConfig inputs = 5; optional string bias_parameter_name = 6; @@ -344,7 +341,7 @@ message LayerConfig { // (which is how convnets are usually trained). Setting this to // false will untie the biases, yielding a separate bias for // every location at which the filter is applied. - optional bool shared_biases = 8 [default = false]; + optional bool shared_biases = 8 [ default = false ]; // Valid values are ones that divide the area of the output // grid in this convolutional layer. For example if this layer @@ -362,33 +359,35 @@ message LayerConfig { // the gpu device which the Layer's data in. // Only used by ParallelNeuralNetork. Ignored otherwise. - optional int32 device = 12 [default = -1]; + optional int32 device = 12 [ default = -1 ]; - // for recurrent layer. If true, the recurrence runs from the end to the beginning. - optional bool reversed = 13 [default = false]; + // for recurrent layer. If true, the recurrence runs from the end to the + // beginning. + optional bool reversed = 13 [ default = false ]; - // for lstmemory layer. Different types of nodes have different activation type. - optional string active_gate_type = 14; + // for lstmemory layer. Different types of nodes have different activation + // type. + optional string active_gate_type = 14; optional string active_state_type = 15; // For NCELayer // The number of random negative labels for each sample - optional int32 num_neg_samples = 16 [default = 10]; + optional int32 num_neg_samples = 16 [ default = 10 ]; // For NCELayer // The distribution for generating the random negative labels. // A uniform distribution will be used if not provided - repeated double neg_sampling_dist = 17 [packed = true]; + repeated double neg_sampling_dist = 17 [ packed = true ]; // For MaxLayer // default: output VALUE of MaxLayer. set this flag to true for output INDEX // INDEX will be put in Argument::value as double values. - optional bool output_max_index = 19 [default = false]; + optional bool output_max_index = 19 [ default = false ]; /// The filed number 20 have been deprecated. // For self-normalized estimation - optional double softmax_selfnorm_alpha = 21 [default = 0.1]; + optional double softmax_selfnorm_alpha = 21 [ default = 0.1 ]; /// The filed numbers 22 and 23 have been deprecated. 
@@ -399,14 +398,14 @@ message LayerConfig { optional bool norm_by_times = 25; // for CostLayers - optional double coeff = 26 [default = 1.0]; + optional double coeff = 26 [ default = 1.0 ]; // for AverageLayer // can be set to: 'average', 'sum' or 'squarerootn' optional string average_strategy = 27; // for error clipping - optional double error_clipping_threshold = 28 [default = 0.0]; + optional double error_clipping_threshold = 28 [ default = 0.0 ]; // for operators used by mixed layer repeated OperatorConfig operator_confs = 29; @@ -434,43 +433,44 @@ message LayerConfig { optional uint32 beam_size = 39; // for seqlastins layer, whether select first instead last - optional bool select_first = 40 [default = false]; + optional bool select_first = 40 [ default = false ]; // for seqlastins layer, AverageLayer, MaxLayer and ExpandLayer // can be set to: 'non-seq','seq' - optional string trans_type = 41 [default = 'non-seq']; + optional string trans_type = 41 [ default = 'non-seq' ]; // to indicate whether selective_fc layer // is used in sequence generation or not - optional bool selective_fc_pass_generation = 42 [default = false]; + optional bool selective_fc_pass_generation = 42 [ default = false ]; // to indicate whether selective_fc layer take its last input to // selected several columns and only compute the multiplications // between the input matrices and the selected columns of // the parameter matrices of this layer. // if set false, selective_fc degrades into fc. - optional bool has_selected_colums = 43 [default = true]; + optional bool has_selected_colums = 43 [ default = true ]; // this parameter is for speed consideration. // if number of the selected columns is less than // sample number * selective_fc output size * selective_fc_mull_mull_ratio // sparse multiplication is used, otherwise, using full multiplication. - optional double selective_fc_full_mul_ratio = 44 [default = 0.02]; + optional double selective_fc_full_mul_ratio = 44 [ default = 0.02 ]; // to indicate how many threads selective_fc use to to accelate // the plain_mul period // leave empty or set to 0 to disable multi-thread accleleration - optional uint32 selective_fc_parallel_plain_mul_thread_num = 45 [default = 0]; + optional uint32 selective_fc_parallel_plain_mul_thread_num = 45 + [ default = 0 ]; // for batch normalization layer // if set use_global_stats true, will use the loaded mean and variance. optional bool use_global_stats = 46; // use to compute moving mean and variance. - optional double moving_average_fraction = 47 [default = 0.9]; + optional double moving_average_fraction = 47 [ default = 0.9 ]; // bias size - optional uint32 bias_size = 48 [default = 0]; + optional uint32 bias_size = 48 [ default = 0 ]; // this parameter can be used as a user-defined parameter when necessary, // without changing the proto file. @@ -485,18 +485,17 @@ message LayerConfig { optional uint64 width = 51; // blank label used in ctc loss - optional uint32 blank = 52 [default = 0]; + optional uint32 blank = 52 [ default = 0 ]; // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which // controls the scope of pooling operation. can be set > 0. // leave empty or set to -1 to disable this stride pooling. 
- optional int32 seq_pool_stride = 53 [default = -1]; + optional int32 seq_pool_stride = 53 [ default = -1 ]; // for crop layer - optional int32 axis = 54 [default = 2]; + optional int32 axis = 54 [ default = 2 ]; repeated uint32 offset = 55; repeated uint32 shape = 56; - } message EvaluatorConfig { @@ -512,9 +511,9 @@ message EvaluatorConfig { // Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator // For multi binary labels: true if output > classification_threshold - optional double classification_threshold = 6 [default = 0.5]; + optional double classification_threshold = 6 [ default = 0.5 ]; // The positive label. -1 means average precision and recall - optional int32 positive_label = 7 [default = -1]; + optional int32 positive_label = 7 [ default = -1 ]; // load dict from this file optional string dict_file = 8; @@ -523,10 +522,10 @@ message EvaluatorConfig { optional string result_file = 9; // top # results for max id printer - optional int32 num_results = 10 [default = 1]; + optional int32 num_results = 10 [ default = 1 ]; // whether to delimit the sequence in the seq_text_printer - optional bool delimited = 11 [default = true]; + optional bool delimited = 11 [ default = true ]; // Used by ChunkEvaluator // chunk of these types are not counted @@ -534,23 +533,23 @@ message EvaluatorConfig { // Used by ClassificationErrorEvaluator // top # classification error - optional int32 top_k = 13 [default = 1]; + optional int32 top_k = 13 [ default = 1 ]; // Used by DetectionMAPEvaluator - optional double overlap_threshold = 14 [default = 0.5]; + optional double overlap_threshold = 14 [ default = 0.5 ]; - optional int32 background_id = 15 [default = 0]; + optional int32 background_id = 15 [ default = 0 ]; - optional bool evaluate_difficult = 16 [default = false]; + optional bool evaluate_difficult = 16 [ default = false ]; - optional string ap_type = 17 [default = "11point"]; + optional string ap_type = 17 [ default = "11point" ]; } message LinkConfig { required string layer_name = 1; required string link_name = 2; // If true, this link has sub-sequence - optional bool has_subseq = 3 [default = false]; + optional bool has_subseq = 3 [ default = false ]; } message MemoryConfig { @@ -563,18 +562,18 @@ message MemoryConfig { optional uint32 boot_with_const_id = 7; // memory is a sequence, initailized by a sequence boot layer - optional bool is_sequence = 6 [default = false]; + optional bool is_sequence = 6 [ default = false ]; } message GeneratorConfig { required uint32 max_num_frames = 1; required string eos_layer_name = 2; - optional int32 num_results_per_sample = 3 [default = 1]; + optional int32 num_results_per_sample = 3 [ default = 1 ]; // for beam search - optional int32 beam_size = 4 [default = 1]; + optional int32 beam_size = 4 [ default = 1 ]; - optional bool log_prob = 5 [default = true]; + optional bool log_prob = 5 [ default = true ]; } message SubModelConfig { @@ -584,10 +583,10 @@ message SubModelConfig { repeated string output_layer_names = 4; repeated string evaluator_names = 5; - optional bool is_recurrent_layer_group = 6 [default = false]; + optional bool is_recurrent_layer_group = 6 [ default = false ]; // If true, the recurrence runs from the end to the beginning. 
- optional bool reversed = 7 [default = false]; + optional bool reversed = 7 [ default = false ]; // name and link name of memory repeated MemoryConfig memories = 8; @@ -601,14 +600,15 @@ message SubModelConfig { optional GeneratorConfig generator = 11; - // the id of inlink which share info with outlinks, used in recurrent layer group + // the id of inlink which share info with outlinks, used in recurrent layer + // group optional int32 target_inlinkid = 12; } message ModelConfig { // type of the model. // Currently, "nn", "recurrent_nn" and "recursive_nn" are supported - required string type = 1 [default = "nn"]; + required string type = 1 [ default = "nn" ]; // layers should be ordered in such a way that the forward propagation // can be correctly executed by going from the first layer to the last layer diff --git a/proto/OptimizerConfig.proto b/proto/OptimizerConfig.proto index 2a87e293f64d3398dea2641c3ff292eceec7e154..d27b1bcf80045216a5807812d39f7a248a956076 100644 --- a/proto/OptimizerConfig.proto +++ b/proto/OptimizerConfig.proto @@ -1,5 +1,5 @@ syntax = "proto2"; - + option optimize_for = LITE_RUNTIME; package paddle; @@ -9,13 +9,11 @@ message SGDConfig { // momentum: float >= 0. Parameter updates momentum. // decay: float >= 0. Learning rate decay over each update. // nesterov: boolean. Whether to apply Nesterov momentum. - optional double momentum = 21 [default = 0.0]; - optional double decay = 23 [default = 0.0]; - optional bool nesterov =24 [default = false]; - + optional double momentum = 21 [ default = 0.0 ]; + optional double decay = 23 [ default = 0.0 ]; + optional bool nesterov = 24 [ default = false ]; } - message AdadeltaConfig { // Adadelta // It is recommended to leave it at the default value. @@ -23,21 +21,23 @@ message AdadeltaConfig { // epsilon: float >= 0. Fuzz factor. // decay: float >= 0. Learning rate decay over each update. - // reference : [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701) - optional double rho = 33 [default = 0.90]; - optional double epsilon = 31 [default = 1e-5]; - optional double decay = 32 [default = 0.0]; - + // reference : [Adadelta - an adaptive learning rate + // method](http://arxiv.org/abs/1212.5701) + optional double rho = 33 [ default = 0.90 ]; + optional double epsilon = 31 [ default = 1e-5 ]; + optional double decay = 32 [ default = 0.0 ]; } message AdagradConfig { -// Adagrad -// epsilon: float >= 0. -// decay: float >= 0. Learning rate decay over each update. + // Adagrad + // epsilon: float >= 0. + // decay: float >= 0. Learning rate decay over each update. -// reference : [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) - optional double epsilon = 41 [default = 1e-5]; - optional double decay = 42 [default = 0.0]; + // reference : [Adaptive Subgradient Methods for Online Learning and + // Stochastic + // Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + optional double epsilon = 41 [ default = 1e-5 ]; + optional double decay = 42 [ default = 0.0 ]; } message AdamConfig { @@ -46,7 +46,8 @@ message AdamConfig { // beta_2: float, 0 < beta < 1. Generally close to 1. // epsilon: float >= 0. Fuzz factor. // decay: float >= 0. Learning rate decay over each update. 
- // reference : [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8) + // reference : [Adam - A Method for Stochastic + // Optimization](http://arxiv.org/abs/1412.6980v8) optional double beta_1 = 41; optional double beta_2 = 42; optional double epsilon = 43; @@ -55,32 +56,32 @@ message AdamConfig { message ConstLrConfig { // learninRate Policy - optional double learning_rate = 1 [default = 1.0]; + optional double learning_rate = 1 [ default = 1.0 ]; } message LinearLrConfig { // learninRate Policy - optional double learning_rate = 1 [default = 1.0]; + optional double learning_rate = 1 [ default = 1.0 ]; optional double lr_decay_a = 2; optional double lr_decay_b = 3; } message TensorProto { -enum DataType { - PADDLE_ELEMENT_TYPE_INT32 = 0; - PADDLE_ELEMENT_TYPE_UINT32 = 1; - PADDLE_ELEMENT_TYPE_INT64 = 2; - PADDLE_ELEMENT_TYPE_UINT64 = 3; - PADDLE_ELEMENT_TYPE_FLOAT32 = 4; - PADDLE_ELEMENT_TYPE_FLOAT64 = 5; -} + enum DataType { + PADDLE_ELEMENT_TYPE_INT32 = 0; + PADDLE_ELEMENT_TYPE_UINT32 = 1; + PADDLE_ELEMENT_TYPE_INT64 = 2; + PADDLE_ELEMENT_TYPE_UINT64 = 3; + PADDLE_ELEMENT_TYPE_FLOAT32 = 4; + PADDLE_ELEMENT_TYPE_FLOAT64 = 5; + } optional DataType data_type = 1; repeated bytes content = 2; } message LrPolicyState { // learninRate Policy - optional double learning_rate = 1 [default = 1.0]; + optional double learning_rate = 1 [ default = 1.0 ]; optional double lr_decay_a = 2; optional double lr_decay_b = 3; } @@ -104,7 +105,6 @@ message AdadeltaOptimizerState { optional TensorProto update_delta = 4; } - message AdagradOptimizerState { optional LrPolicyState lr_state = 101; optional double num_sample_passed = 104; @@ -124,10 +124,10 @@ message AdamOptimizerState { message OptimizerConfig { enum Optimizer { - SGD = 1; - Adadelta = 2; - Adagrad = 3; - Adam = 4; + SGD = 1; + Adadelta = 2; + Adagrad = 3; + Adam = 4; } optional Optimizer optimizer = 1; optional SGDConfig sgd = 3; @@ -136,8 +136,8 @@ message OptimizerConfig { optional AdamConfig adam = 6; enum LrPolicy { - Const = 0; - Linear = 1; + Const = 0; + Linear = 1; } optional LrPolicy lr_policy = 11; optional ConstLrConfig const_lr = 12; diff --git a/proto/ParameterConfig.proto b/proto/ParameterConfig.proto index 580d66324602df4c655dd2f1e1cd87159b5b346b..b13570a2c6e7b16e45892a31bb496a9dd2099df0 100644 --- a/proto/ParameterConfig.proto +++ b/proto/ParameterConfig.proto @@ -27,56 +27,57 @@ enum ParameterInitStrategy { message ParameterUpdaterHookConfig { // hook type such as 'pruning' required string type = 1; - // this represents the ratio of zero element to be set by the Parameter - optional double sparsity_ratio = 2 [default = 0.6]; + // this represents the ratio of zero element to be set by the Parameter + optional double sparsity_ratio = 2 [ default = 0.6 ]; } message ParameterConfig { required string name = 1; required uint64 size = 2; - optional double learning_rate = 3 [default = 1.0]; - optional double momentum = 4 [default = 0.0]; - optional double initial_mean = 5 [default = 0.0]; - optional double initial_std = 6 [default = 0.01]; + optional double learning_rate = 3 [ default = 1.0 ]; + optional double momentum = 4 [ default = 0.0 ]; + optional double initial_mean = 5 [ default = 0.0 ]; + optional double initial_std = 6 [ default = 0.01 ]; // use L2-regularization if decay_rate set and decay_rate_l1 not set - optional double decay_rate = 7 [default = 0.0]; + optional double decay_rate = 7 [ default = 0.0 ]; // use L1-regularization if decay_rate_l1 set - optional double decay_rate_l1 = 8 [default = 
0.0]; + optional double decay_rate_l1 = 8 [ default = 0.0 ]; // dims of Parameter, e.g. dims[0] as height, dims[1] as width.. repeated uint64 dims = 9; // the gpu device which the parameter in. // Only used by ParallelNeuralNetork. Ignored otherwise. - optional int32 device = 10 [default = -1]; + optional int32 device = 10 [ default = -1 ]; // how to init the parameter: 0 -> normal, 1 -> uniform // 0: treat initial_mean as mean, intial_std as standard deviation // 1: range is (initial_mean - initial_std) to (initial_mean + initial_std) - optional int32 initial_strategy = 11 [default = 0]; + optional int32 initial_strategy = 11 [ default = 0 ]; // define the variance when init the parameter, by height of the Matrix - optional bool initial_smart = 12 [default = false]; + optional bool initial_smart = 12 [ default = false ]; // apply regularization every # batches - optional int32 num_batches_regularization = 13 [default = 1]; + optional int32 num_batches_regularization = 13 [ default = 1 ]; // if is_sparse is true, para is sparse, else para is dense - optional bool is_sparse = 14[default = false]; - // if para is sparse, format should be "csc" or "csr", empty means is not sparse - optional string format = 15 [default = ""]; + optional bool is_sparse = 14 [ default = false ]; + // if para is sparse, format should be "csc" or "csr", empty means is not + // sparse + optional string format = 15 [ default = "" ]; // sparse remote update or not - optional bool sparse_remote_update = 16 [default = false]; + optional bool sparse_remote_update = 16 [ default = false ]; // gradient clipping threshold, no clipping by default - optional double gradient_clipping_threshold = 17 [default = 0.0]; + optional double gradient_clipping_threshold = 17 [ default = 0.0 ]; // static parameters are fixed when training - optional bool is_static = 18 [default = false]; + optional bool is_static = 18 [ default = false ]; // para_id should NOT be set by config_parser. It is for // internal use. optional uint64 para_id = 19; repeated ParameterUpdaterHookConfig update_hooks = 20; // setup load mat -> csr - optional bool need_compact = 21 [default = false]; + optional bool need_compact = 21 [ default = false ]; // whether to do sparse update for this parameter - optional bool sparse_update = 22 [default = false]; + optional bool sparse_update = 22 [ default = false ]; // whether this parameter is shared or not. - optional bool is_shared = 23 [default = false]; + optional bool is_shared = 23 [ default = false ]; // parameter block size - optional uint64 parameter_block_size = 24 [default = 0]; + optional uint64 parameter_block_size = 24 [ default = 0 ]; } diff --git a/proto/ParameterServerConfig.proto b/proto/ParameterServerConfig.proto index 404f9613792653dda72eeb98f022851adedbfbfd..bd63cf35b1483a45f21de6f0d0d883e4d8432296 100644 --- a/proto/ParameterServerConfig.proto +++ b/proto/ParameterServerConfig.proto @@ -15,13 +15,10 @@ syntax = "proto2"; package paddle; - /** * Configuration structure for ParameterClient2. */ -message ParameterClientConfig { - required int32 trainer_id = 1; -} +message ParameterClientConfig { required int32 trainer_id = 1; } /** * Configuration structure for ParameterServer2. 
@@ -30,24 +27,24 @@ message ParameterServerConfig { // Number of ports for sending dense parameter, // following ports on parameter server will be visited // for sending dense parameter: [port, port+ports_num-1] - required int32 ports_num = 1 [default = 1]; + required int32 ports_num = 1 [ default = 1 ]; // Number of ports for sending sparse parameter, // following ports on parameter server will be visited // for sending sparse parameter: // [port+ports_num, port+ports_num+ports_num_for_sparse-1] - required int32 ports_num_for_sparse = 2 [default = 0]; + required int32 ports_num_for_sparse = 2 [ default = 0 ]; // network device name for pservers - required string nics = 3 [default = "xgbe0,xgbe1"]; - required string rdma_tcp = 4 [default = "tcp"]; + required string nics = 3 [ default = "xgbe0,xgbe1" ]; + required string rdma_tcp = 4 [ default = "tcp" ]; // Listening port for pserver - required int32 port = 5 [default = 20134]; + required int32 port = 5 [ default = 20134 ]; // number of gradient servers - required int32 num_gradient_servers = 6 [default = 1]; + required int32 num_gradient_servers = 6 [ default = 1 ]; // number of threads for sync op exec - required int32 pserver_num_threads = 7 [default = 1]; + required int32 pserver_num_threads = 7 [ default = 1 ]; // control config_.async_lagged_grad_discard_ratio() min value - required double async_lagged_ratio_min = 8 [default = 1.0]; + required double async_lagged_ratio_min = 8 [ default = 1.0 ]; // if async_lagged_grad_discard_ratio is not set in trainer_config.conf // use it as defalut value - required double async_lagged_ratio_default = 9 [default = 1.5]; + required double async_lagged_ratio_default = 9 [ default = 1.5 ]; } \ No newline at end of file diff --git a/proto/ParameterService.proto b/proto/ParameterService.proto index c1c04d8cc5bdedd09173d5dfa10b82c7ee7ed6a4..e3c180ccc3f2a9bfa13c443944cc5ae3398818a9 100644 --- a/proto/ParameterService.proto +++ b/proto/ParameterService.proto @@ -23,8 +23,8 @@ package paddle; */ enum ParameterUpdateMode { // Set parameter - PSERVER_UPDATE_MODE_SET_PARAM = 0;//use local param - PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1;//set zero param + PSERVER_UPDATE_MODE_SET_PARAM = 0; // use local param + PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1; // set zero param // Update parameter once a gradient is received PSERVER_UPDATE_MODE_ASYNC_SGD = 2; @@ -37,7 +37,7 @@ enum ParameterUpdateMode { // No update. Only get parameters back. 
PSERVER_UPDATE_MODE_GET_PARAM = 5; - PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6;//only get sparse rows + PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6; // only get sparse rows }; message ParameterBlock { @@ -80,42 +80,34 @@ message SendParameterRequest { optional int32 trainer_id = 7; // send back parameter type on pserver, PARAMETER_VALUE by default - optional int32 send_back_parameter_type = 8 [default = 0]; + optional int32 send_back_parameter_type = 8 [ default = 0 ]; // forwardbackward time in usec optional uint64 forwardbackward_time = 9; - } -message WaitPassStartRequest { -} +message WaitPassStartRequest {} -message WaitPassStartResponse { -} +message WaitPassStartResponse {} -message WaitPassFinishRequest { -} +message WaitPassFinishRequest {} -message WaitPassFinishResponse { -} +message WaitPassFinishResponse {} enum SyncObject { SYNC_DEFAULT = 0; // wait for the synchronizeBarrier_ - SYNC_DATA = 1; // wait for the synchronizeDataBarrier_ + SYNC_DATA = 1; // wait for the synchronizeDataBarrier_ } message SynchronizeRequest { - required SyncObject sync_object_id = 1 [default = SYNC_DEFAULT]; + required SyncObject sync_object_id = 1 [ default = SYNC_DEFAULT ]; optional int32 trainer_id = 2; } -message SynchronizeResponse { -} +message SynchronizeResponse {} -message SendParameterResponse { - repeated ParameterBlock blocks = 1; -} +message SendParameterResponse { repeated ParameterBlock blocks = 1; } message SetConfigRequest { repeated ParameterConfig param_configs = 1; @@ -125,26 +117,18 @@ message SetConfigRequest { required bool is_sparse_server = 6; } -message SetConfigResponse{ -} +message SetConfigResponse {} -message GetStatusRequest { -} +message GetStatusRequest {} -message GetStatusResponse { - required PServerStatus status = 1; -} +message GetStatusResponse { required PServerStatus status = 1; } -message SetStatusRequest { - required PServerStatus status = 1; -} +message SetStatusRequest { required PServerStatus status = 1; } -message SetStatusResponse { -} +message SetStatusResponse {} // create a column vector. The size is the dimension of parameter -message CreateVectorRequest { -} +message CreateVectorRequest {} message CreateVectorResponse { // error message. Empty if success @@ -153,9 +137,7 @@ message CreateVectorResponse { required int64 handle = 2; } -message ReleaseVectorRequest { - required int64 handle = 1; -} +message ReleaseVectorRequest { required int64 handle = 1; } message ReleaseVectorResponse { // error message. Empty if success @@ -164,9 +146,7 @@ message ReleaseVectorResponse { // Create a column major matrix. The number of rows is the dimension // of parameter. The number of columns is specifed by num_cols -message CreateMatrixRequest { - required int32 num_cols = 1; -} +message CreateMatrixRequest { required int32 num_cols = 1; } message CreateMatrixResponse { // error message. Empty if success @@ -175,16 +155,13 @@ message CreateMatrixResponse { required int64 handle = 2; } -message ReleaseMatrixRequest { - required int64 handle = 1; -} +message ReleaseMatrixRequest { required int64 handle = 1; } message ReleaseMatrixResponse { // error message. 
Empty if success optional string return_message = 1; } - /** * The operations are defined using the variables commented at Operation * and OperationResult @@ -245,36 +222,36 @@ enum MatrixVectorOperation { message ProtoVector { required int64 dim = 1; - repeated double values = 2 [packed = true]; + repeated double values = 2 [ packed = true ]; } message ProtoMatrix { required int64 num_rows = 1; required int64 num_cols = 2; - repeated double values = 3 [packed = true]; + repeated double values = 3 [ packed = true ]; } message Operation { required MatrixVectorOperation operation = 1; // vector handles created on the pserver - repeated int64 pvectors = 2; // u, v, w + repeated int64 pvectors = 2; // u, v, w // matrix handles created on the pserver - repeated int64 pmatrices = 3; // A, B, C + repeated int64 pmatrices = 3; // A, B, C - repeated double scalars = 4; // a, b, c - repeated ProtoVector vectors = 5; // x, y, z - repeated ProtoMatrix matrices = 6; // X, Y, Z + repeated double scalars = 4; // a, b, c + repeated ProtoVector vectors = 5; // x, y, z + repeated ProtoMatrix matrices = 6; // X, Y, Z } message OperationResult { // error message. Empty if success optional string return_message = 1; -// - repeated double scalars = 2; // d, e, f + // + repeated double scalars = 2; // d, e, f repeated ProtoVector vectors = 3; // p, q, r - repeated ProtoMatrix matrices = 4; // P, Q, R + repeated ProtoMatrix matrices = 4; // P, Q, R } message DoOperationRequest { @@ -301,18 +278,14 @@ message DoOperationResponse { required bool pass_finish = 3; } -message LoadValueRequest { - required string dir_name = 1; -} +message LoadValueRequest { required string dir_name = 1; } message LoadValueResponse { // error message. Empty if success optional string return_message = 1; } -message SaveValueRequest { - required string dir_name = 1; -} +message SaveValueRequest { required string dir_name = 1; } message SaveValueResponse { // error message. 
Empty if success @@ -331,11 +304,11 @@ enum DataUpdateMode { // Client send it's own ref label to pserver DATA_UPDATE_MODE_SET_REF_LABEL = 4; // Client get all ref labels from all pservers - DATA_UPDATE_MODE_GET_REF_LABEL =5; + DATA_UPDATE_MODE_GET_REF_LABEL = 5; // Client send it's own ref grad to pserver - DATA_UPDATE_MODE_SET_REF_GRAD =6; + DATA_UPDATE_MODE_SET_REF_GRAD = 6; // Client get all ref grad from all pservers - DATA_UPDATE_MODE_GET_REF_GRAD =7; + DATA_UPDATE_MODE_GET_REF_GRAD = 7; } enum SendDataType { @@ -360,7 +333,7 @@ message DataBlock { // byte size of one data type required int32 data_size = 2; // data_type - optional TransDataType data_type = 3 [default = TRANS_DOUBLE]; + optional TransDataType data_type = 3 [ default = TRANS_DOUBLE ]; } message SendDataRequest { diff --git a/proto/TrainerConfig.proto b/proto/TrainerConfig.proto index a819d20d11ff3932d331801007b8cfb9c77a3f2b..b7c2355159e66be0a1550d3c8fde9a15346ff7e4 100644 --- a/proto/TrainerConfig.proto +++ b/proto/TrainerConfig.proto @@ -20,14 +20,14 @@ package paddle; message OptimizationConfig { required int32 batch_size = 3; - required string algorithm = 4 [default = "async_sgd"]; - optional int32 num_batches_per_send_parameter = 5 [default = 1]; - optional int32 num_batches_per_get_parameter = 6 [default = 1]; + required string algorithm = 4 [ default = "async_sgd" ]; + optional int32 num_batches_per_send_parameter = 5 [ default = 1 ]; + optional int32 num_batches_per_get_parameter = 6 [ default = 1 ]; required double learning_rate = 7; - optional double learning_rate_decay_a = 8 [default = 0]; - optional double learning_rate_decay_b = 9 [default = 0]; - optional string learning_rate_schedule = 27 [default = "constant"]; + optional double learning_rate_decay_a = 8 [ default = 0 ]; + optional double learning_rate_decay_b = 9 [ default = 0 ]; + optional string learning_rate_schedule = 27 [ default = "constant" ]; // learning rate will be scaled according to learning_rate_schedule // 1), constant: // lr = learning_rate @@ -49,88 +49,92 @@ message OptimizationConfig { // owlqn related // L1-regularization - optional double l1weight = 10 [default = 0.1]; + optional double l1weight = 10 [ default = 0.1 ]; // L2-regularization - optional double l2weight = 11 [default = 0]; + optional double l2weight = 11 [ default = 0 ]; // "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step) // then accept the step - optional double c1 = 12 [default = 0.0001]; + optional double c1 = 12 [ default = 0.0001 ]; // multiply the step with "backoff", when wolfe condition doesn't satisfy - optional double backoff = 13 [default = 0.5]; + optional double backoff = 13 [ default = 0.5 ]; // how many "s"s and "y"s are kept in owlqn - optional int32 owlqn_steps = 14 [default = 10]; + optional int32 owlqn_steps = 14 [ default = 10 ]; // accept the step if encountered "max_backoff" times of "reduce the step" - optional int32 max_backoff = 15 [default = 5]; + optional int32 max_backoff = 15 [ default = 5 ]; // L2-regularization coefficient is reduced linearly from iteration 0 to // "l2weight_zero_iter", and set to 0 after "l2weight_zero_iter" // iterations. set "l2weight_zero_iter" to 0 to disable this strategy. - optional int32 l2weight_zero_iter = 17 [default = 0]; + optional int32 l2weight_zero_iter = 17 [ default = 0 ]; // averaged sgd // About average_window * numBatchProcessed parameter are used // for average. 
To be accurate, between average_window * numBatchProcessed // and 2 * average_window * numBatchProcessed parameters are used for // average. - optional double average_window = 18 [default = 0]; - optional int64 max_average_window = 19 [default = 0x7fffffffffffffff]; + optional double average_window = 18 [ default = 0 ]; + optional int64 max_average_window = 19 [ default = 0x7fffffffffffffff ]; ////////////////////////// // Options Adaptive SGD // ////////////////////////// - // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta", "rmsprop" - // default learning method("momentum") use global decayed learning rate with momentum. + // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta", + // "rmsprop" + // default learning method("momentum") use global decayed learning rate with + // momentum. // "adagrad", "adadelta" and "rmsprop" can set momentum too. - optional string learning_method = 23 [default = "momentum"]; - optional double ada_epsilon = 24 [default = 1e-6]; - optional double ada_rou = 26 [default = 0.95]; + optional string learning_method = 23 [ default = "momentum" ]; + optional double ada_epsilon = 24 [ default = 1e-6 ]; + optional double ada_rou = 26 [ default = 0.95 ]; // Force to do average in cpu in order to save gpu memory usage - optional bool do_average_in_cpu = 25 [default = false]; + optional bool do_average_in_cpu = 25 [ default = false ]; // delta add rate in pserver, used while num_batches_per_send_parameter>1 // will be divided by #machines automatically. - optional double delta_add_rate = 28 [default = 1.0]; + optional double delta_add_rate = 28 [ default = 1.0 ]; // We split a large size into smaller mini-batches, whose sizes are // determined by mini_batch_size. It only takes effect when there is // an ExternalMachine. - optional int32 mini_batch_size = 29 [default = 128]; + optional int32 mini_batch_size = 29 [ default = 128 ]; // automatically set if any one of parameters set sparse remote update flag - optional bool use_sparse_remote_updater = 30 [default = false]; + optional bool use_sparse_remote_updater = 30 [ default = false ]; - // how to update center parameter and feedback to local parameter, + // how to update center parameter and feedback to local parameter, // when use local sgd update in cluster training. - // A option is elastic_average, proposed by the paper: Deep learning with elastic averaging SGD. - // If use elastic_average method, every trainer node should sample from whole data sets. - optional string center_parameter_update_method = 31 [default = "average"]; + // A option is elastic_average, proposed by the paper: Deep learning with + // elastic averaging SGD. + // If use elastic_average method, every trainer node should sample from whole + // data sets. 
+ optional string center_parameter_update_method = 31 [ default = "average" ]; // shrink sparse parameter value // only works if parameter is remote sparse update and has L1 decay rate - optional double shrink_parameter_value = 32 [default = 0]; + optional double shrink_parameter_value = 32 [ default = 0 ]; //////////////////////////// // Options Adam Optimizer // //////////////////////////// - optional double adam_beta1 = 33 [default = 0.9]; - optional double adam_beta2 = 34 [default = 0.999]; - optional double adam_epsilon = 35 [default = 1e-8]; + optional double adam_beta1 = 33 [ default = 0.9 ]; + optional double adam_beta2 = 34 [ default = 0.999 ]; + optional double adam_epsilon = 35 [ default = 1e-8 ]; // arguments for learning rate scheduler // Format: num1:rate1,num2:rate2,...,numK:rateK // For learning_rate_schedule="manual", num is the number of samples, // For learning_rate_schedule="pass_manual", // num is the number of passes (starting from 0) - optional string learning_rate_args = 36 [default = ""]; - + optional string learning_rate_args = 36 [ default = "" ]; + // for async sgd gradient commit control. // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed, // current async gradient will be discard silently. - optional double async_lagged_grad_discard_ratio = 37 [default = 1.5]; + optional double async_lagged_grad_discard_ratio = 37 [ default = 1.5 ]; - // global threshold for gradient clipping - optional double gradient_clipping_threshold = 38 [default = 0.0]; + // global threshold for gradient clipping + optional double gradient_clipping_threshold = 38 [ default = 0.0 ]; }; message TrainerConfig { @@ -141,7 +145,7 @@ message TrainerConfig { repeated string config_files = 5; // the directory to save/load model files for each training path - optional string save_dir = 6 [default = "./output/model"]; + optional string save_dir = 6 [ default = "./output/model" ]; // Path of the initial model parameters. // If it was set, start_pass will be ignored. @@ -149,7 +153,7 @@ message TrainerConfig { // Start training from this pass. // Will load parameter from the previous pass. - optional int32 start_pass = 8 [default = 0]; + optional int32 start_pass = 8 [ default = 0 ]; // file path to the trainer config file optional string config_file = 9;