Commit a1e16bb5 authored by Y Yu Yang

Merge branch 'develop' into feature/change_op_creation

......@@ -24,7 +24,7 @@
description: Format files with ClangFormat.
entry: clang-format -i
language: system
files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:
......
......@@ -36,8 +36,8 @@ include(simd)
################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND})
option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND})
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." OFF)
option(WITH_MKLML "Compile PaddlePaddle with mklml package." OFF)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
......@@ -55,7 +55,6 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(UNITTEST_USE_VIRTUALENV "Python unittest with virtualenv" ON)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
......
......@@ -27,13 +27,16 @@ RUN apt-get update && \
git python-pip python-dev openssh-server bison \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-numpy python-matplotlib gcc-4.8 g++-4.8 \
python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format-3.8 swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools && \
apt-get clean -y
# paddle is using numpy.flip, which was introduced in numpy 1.12.0
RUN pip --no-cache-dir install 'numpy>=1.12.0'
# Install Go and glide
RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
tar -C /usr/local -xzf go.tgz && \
......
......@@ -56,11 +56,14 @@ macro(add_style_check_target TARGET_NAME)
# cpplint code style
get_filename_component(base_filename ${filename} NAME)
set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD
add_custom_command(OUTPUT ${CUR_GEN} PRE_BUILD
COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
"--filter=${STYLE_FILTER}"
"--write-success=${CUR_GEN}" ${filename}
DEPENDS ${filename} ${PROJ_ROOT}/paddle/scripts/cpplint.py
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
add_custom_target(${base_filename}.cpplint DEPENDS ${CUR_GEN})
add_dependencies(${TARGET_NAME} ${base_filename}.cpplint)
endif()
endforeach()
endif()
......
......@@ -118,7 +118,6 @@ endfunction()
macro(add_unittest_without_exec TARGET_NAME)
add_executable(${TARGET_NAME} ${ARGN})
link_paddle_test(${TARGET_NAME})
add_style_check_target(${TARGET_NAME} ${ARGN})
endmacro()
# add_unittest
......@@ -150,12 +149,6 @@ endfunction()
# Create a python unittest using run_python_tests.sh,
# which takes care of making correct running environment
function(add_python_test TEST_NAME)
if (UNITTEST_USE_VIRTUALENV)
add_test(NAME ${TEST_NAME}
COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR}
bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh ${ARGN}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
else()
foreach(arg ${ARGN})
get_filename_component(py_fn ${arg} NAME_WE)
set(TRG_NAME ${TEST_NAME}_${py_fn})
......@@ -164,5 +157,4 @@ function(add_python_test TEST_NAME)
python2 ${arg}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endforeach()
endif()
endfunction()
......@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_batch_transpose.h"
#include "hl_base.h"
#include "hl_batch_transpose.h"
const int TILE_DIM = 64;
const int BLOCK_ROWS = 16;
// No bank-conflict transpose for a batch of data.
__global__ void batchTransposeNoBankConflicts(real* odata,
const real* idata,
int numSamples, int width,
int height) {
__global__ void batchTransposeNoBankConflicts(
real* odata, const real* idata, int numSamples, int width, int height) {
__shared__ float tile[TILE_DIM][TILE_DIM + 1];
const int x = blockIdx.x * TILE_DIM + threadIdx.x;
......@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
newX] = tile[threadIdx.x][j];
}
void batchTranspose(const real* input, real* output, int width, int height,
int batchSize) {
void batchTranspose(
const real* input, real* output, int width, int height, int batchSize) {
dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
(output, input, batchSize, width, height);
batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
output, input, batchSize, width, height);
CHECK_SYNC("batchTranspose failed!");
}
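The padded shared-memory tile above (__shared__ float tile[TILE_DIM][TILE_DIM + 1]) is the standard trick for a conflict-free transpose: the extra column shifts each row by one bank, so a warp reading a tile column touches distinct shared-memory banks instead of the same one. A minimal, self-contained sketch of the same idea — not part of this commit; the 32x32 tile, the kernel name and the launch shape are illustrative assumptions:

#include <cuda_runtime.h>

#define TILE 32

// Illustrative conflict-free transpose of a width x height matrix.
// Launch with dim3 block(TILE, TILE) and a grid covering the matrix.
__global__ void transposeTile(float* out, const float* in, int width, int height) {
  __shared__ float tile[TILE][TILE + 1];  // +1 column avoids bank conflicts
  int x = blockIdx.x * TILE + threadIdx.x;
  int y = blockIdx.y * TILE + threadIdx.y;
  if (x < width && y < height) {
    tile[threadIdx.y][threadIdx.x] = in[y * width + x];
  }
  __syncthreads();
  // Write the transposed tile: swap the block coordinates and read the
  // shared-memory tile column-wise, which the padding makes conflict-free.
  x = blockIdx.y * TILE + threadIdx.x;
  y = blockIdx.x * TILE + threadIdx.y;
  if (x < height && y < width) {
    out[y * height + x] = tile[threadIdx.x][threadIdx.y];
  }
}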
......@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_aggregate.h"
#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_cuda.ph"
#include "hl_aggregate.h"
#include "hl_thread.ph"
#include "hl_matrix_base.cuh"
#include "hl_thread.ph"
#include "paddle/utils/Logging.h"
/**
* @brief matrix row operator.
*/
template<class Agg, int blockSize>
__global__ void KeMatrixRowOp(Agg agg,
real *E,
real *Sum,
int dimN) {
template <class Agg, int blockSize>
__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
__shared__ real sum_s[blockSize];
int cnt = (dimN + blockSize -1) / blockSize;
int rowId = blockIdx.x + blockIdx.y*gridDim.x;
int index = rowId*dimN;
int cnt = (dimN + blockSize - 1) / blockSize;
int rowId = blockIdx.x + blockIdx.y * gridDim.x;
int index = rowId * dimN;
int tid = threadIdx.x;
int lmt = tid;
......@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg,
sum_s[tid] = tmp;
__syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) {
for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) {
sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
}
......@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg,
}
template <class Agg>
void hl_matrix_row_op(Agg agg,
real *A_d,
real *C_d,
int dimM,
int dimN) {
void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
int blocksX = dimM;
int blocksY = 1;
dim3 threads(128, 1);
dim3 grid(blocksX, blocksY);
KeMatrixRowOp<Agg, 128><<< grid, threads, 0, STREAM_DEFAULT >>>
(agg, A_d, C_d, dimN);
KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
agg, A_d, C_d, dimN);
}
void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::sum(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_row_sum failed");
}
......@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::max(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_row_max failed");
}
......@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::min(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_row_min failed");
}
/**
* @brief matrix column operator.
*/
template<class Agg>
__global__ void KeMatrixColumnOp(Agg agg,
real *E,
real *Sum,
int dimM,
int dimN) {
template <class Agg>
__global__ void KeMatrixColumnOp(
Agg agg, real *E, real *Sum, int dimM, int dimN) {
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
real tmp = agg.init();
if (rowIdx < dimN) {
......@@ -127,13 +104,10 @@ __global__ void KeMatrixColumnOp(Agg agg,
}
}
template<class Agg, int blockDimX, int blockDimY>
__global__ void KeMatrixColumnOp_S(Agg agg,
real *E,
real *Sum,
int dimM,
int dimN) {
__shared__ real _sum[blockDimX*blockDimY];
template <class Agg, int blockDimX, int blockDimY>
__global__ void KeMatrixColumnOp_S(
Agg agg, real *E, real *Sum, int dimM, int dimN) {
__shared__ real _sum[blockDimX * blockDimY];
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int index = threadIdx.y;
......@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
index += blockDimY;
}
}
_sum[threadIdx.x + threadIdx.y*blockDimX] = tmp;
_sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
__syncthreads();
if (rowIdx < dimN) {
if (threadIdx.y ==0) {
if (threadIdx.y == 0) {
real tmp = agg.init();
for (int i=0; i < blockDimY; i++) {
tmp = agg(tmp, _sum[threadIdx.x + i*blockDimX]);
for (int i = 0; i < blockDimY; i++) {
tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
}
Sum[rowIdx] = tmp;
}
......@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
}
template <class Agg>
void hl_matrix_column_op(Agg agg,
real *A_d,
real *C_d,
int dimM,
int dimN) {
void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
if (dimN >= 8192) {
int blocksX = (dimN + 128 -1) / 128;
int blocksX = (dimN + 128 - 1) / 128;
int blocksY = 1;
dim3 threads(128, 1);
dim3 grid(blocksX, blocksY);
KeMatrixColumnOp<Agg><<< grid, threads, 0, STREAM_DEFAULT >>>
(agg, A_d, C_d, dimM, dimN);
KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
agg, A_d, C_d, dimM, dimN);
} else {
int blocksX = (dimN + 32 -1) / 32;
int blocksX = (dimN + 32 - 1) / 32;
int blocksY = 1;
dim3 threads(32, 32);
dim3 grid(blocksX, blocksY);
KeMatrixColumnOp_S<Agg, 32, 32><<< grid, threads, 0, STREAM_DEFAULT>>>
(agg, A_d, C_d, dimM, dimN);
KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
agg, A_d, C_d, dimM, dimN);
}
return;
......@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::sum(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_column_sum failed");
}
......@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::max(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_column_max failed");
}
......@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::min(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_column_min failed");
}
......@@ -226,16 +184,16 @@ template <int blockSize>
__global__ void KeVectorSum(real *E, real *Sum, int dimM) {
__shared__ double sum_s[blockSize];
int tid = threadIdx.x;
int index = blockIdx.y*blockDim.x+threadIdx.x;
int index = blockIdx.y * blockDim.x + threadIdx.x;
sum_s[tid] = 0.0f;
while (index < dimM) {
sum_s[tid] += E[index];
index += blockDim.x*gridDim.y;
index += blockDim.x * gridDim.y;
}
__syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) {
for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) {
sum_s[tid] += sum_s[tid + stride];
}
......@@ -261,36 +219,37 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
hl_event_t hl_event = &hl_event_st;
while (!hl_cuda_event_is_ready(hl_event)) {}
while (!hl_cuda_event_is_ready(hl_event)) {
}
KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
(A_d, t_resource.gpu_mem, dimM);
KeVectorSum<128><<< 1, threads, 0, STREAM_DEFAULT >>>
(t_resource.gpu_mem, t_resource.cpu_mem, 128);
KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
A_d, t_resource.gpu_mem, dimM);
KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
t_resource.gpu_mem, t_resource.cpu_mem, 128);
hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
cudaError_t err = (cudaError_t)hl_get_device_last_error();
CHECK_EQ(cudaSuccess, err)
<< "CUDA error: " << hl_get_device_error_string((size_t)err);
CHECK_EQ(cudaSuccess, err) << "CUDA error: "
<< hl_get_device_error_string((size_t)err);
}
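hl_vector_sum above follows the common two-pass reduction pattern: the first KeVectorSum launch produces per-block partial sums, a second single-block launch reduces up to 128 partials into one value, and the scalar is copied back asynchronously before the stream is synchronized. A self-contained sketch of the same pattern — illustrative only; the kernel name partialSum and the fixed block size of 128 are assumptions, not PaddlePaddle API:

#include <cuda_runtime.h>

// Illustrative first pass: each block reduces a grid-stride slice of `in`
// into one partial sum. A second launch with <<<1, 128>>> over the partial
// sums then yields the final scalar, as hl_vector_sum does above.
__global__ void partialSum(const float* in, float* out, int n) {
  __shared__ float buf[128];  // assumes blockDim.x == 128
  int tid = threadIdx.x;
  float acc = 0.0f;
  for (int i = blockIdx.x * blockDim.x + tid; i < n; i += blockDim.x * gridDim.x) {
    acc += in[i];
  }
  buf[tid] = acc;
  __syncthreads();
  // Tree reduction within the block, halving the active threads each step.
  for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
    if (tid < stride) buf[tid] += buf[tid + stride];
    __syncthreads();
  }
  if (tid == 0) out[blockIdx.x] = buf[0];
}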
template <int blockSize>
__global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
__shared__ double sum_s[blockSize];
int tid = threadIdx.x;
int index = blockIdx.y*blockDim.x+threadIdx.x;
int index = blockIdx.y * blockDim.x + threadIdx.x;
sum_s[tid] = 0.0f;
while (index < dimM) {
sum_s[tid] += abs(E[index]);
index += blockDim.x*gridDim.y;
index += blockDim.x * gridDim.y;
}
__syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) {
for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) {
sum_s[tid] += sum_s[tid + stride];
}
......@@ -316,18 +275,19 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
hl_event_t hl_event = &hl_event_st;
while (!hl_cuda_event_is_ready(hl_event)) {}
while (!hl_cuda_event_is_ready(hl_event)) {
}
KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
(A_d, t_resource.gpu_mem, dimM);
KeVectorAbsSum<128><<< 1, threads, 0, STREAM_DEFAULT >>>
(t_resource.gpu_mem, t_resource.cpu_mem, 128);
KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
A_d, t_resource.gpu_mem, dimM);
KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
t_resource.gpu_mem, t_resource.cpu_mem, 128);
hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
cudaError_t err = (cudaError_t)hl_get_device_last_error();
CHECK_EQ(cudaSuccess, err)
<< "CUDA error: " << hl_get_device_error_string((size_t)err);
CHECK_EQ(cudaSuccess, err) << "CUDA error: "
<< hl_get_device_error_string((size_t)err);
}
This diff has been collapsed.
This diff has been collapsed.
......@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
#include "hl_matrix.h"
#include "hl_matrix_ops.cuh"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_ops.cuh"
#include "hl_sequence.h"
#include "hl_sparse.ph"
#include "paddle/utils/Logging.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b);
void hl_matrix_add(real *A_d,
real *B_d,
real *C_d,
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);
void hl_matrix_add(real* A_d,
real* B_d,
real* C_d,
int dimM,
int dimN,
real alpha,
......@@ -36,8 +35,8 @@ void hl_matrix_add(real *A_d,
CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d);
hl_gpu_apply_ternary_op
<real, ternary::_add<real>, 0, 0>(ternary::_add<real>(alpha, beta),
hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
ternary::_add<real>(alpha, beta),
A_d,
B_d,
C_d,
......@@ -50,12 +49,11 @@ void hl_matrix_add(real *A_d,
}
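With the ternary op defined above (c = p1 * a + p2 * b), hl_matrix_add applies the elementwise update C = alpha * A + beta * B over a dimM x dimN matrix. A plain host-side equivalent, for illustration only — the function name and the flat, stride-free layout are assumptions:

// Illustrative elementwise form of hl_matrix_add: C = alpha * A + beta * B.
void matrixAddHost(const float* A, const float* B, float* C,
                   int dimM, int dimN, float alpha, float beta) {
  for (int i = 0; i < dimM * dimN; ++i) {
    C[i] = alpha * A[i] + beta * B[i];
  }
}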
#ifdef PADDLE_TYPE_DOUBLE
#define THRESHOLD 128
#define THRESHOLD 128
#else
#define THRESHOLD 64
#define THRESHOLD 64
#endif
__device__ __forceinline__
void findMax(real* I,
__device__ __forceinline__ void findMax(real* I,
real* dfMax_s,
int blockSize,
int base,
......@@ -89,8 +87,7 @@ void findMax(real* I,
__syncthreads();
}
__device__ __forceinline__
void subMaxAndExp(real* I,
__device__ __forceinline__ void subMaxAndExp(real* I,
real* O,
int curIdx,
int nextIdx,
......@@ -115,8 +112,7 @@ void subMaxAndExp(real* I,
__syncthreads();
}
__device__ __forceinline__
void valueSum(real* O,
__device__ __forceinline__ void valueSum(real* O,
real* dfMax_s,
int blockSize,
int base,
......@@ -141,13 +137,8 @@ void valueSum(real* O,
__syncthreads();
}
__device__ __forceinline__
void divSum(real* O,
real sum,
int curIdx,
int nextIdx,
int blockSize,
int dimN) {
__device__ __forceinline__ void divSum(
real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
while (curIdx < dimN) {
O[nextIdx] /= sum;
nextIdx += blockSize;
......@@ -155,8 +146,7 @@ void divSum(real* O,
}
}
__device__ __forceinline__
void softmax(real* I,
__device__ __forceinline__ void softmax(real* I,
real* O,
real* dfMax_s,
int blockSize,
......@@ -167,8 +157,7 @@ void softmax(real* I,
__shared__ real max;
// find the max number
findMax(I, dfMax_s, blockSize, base, curIdx,
nextIdx, dimN, &max);
findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);
// sub max Value and do Exp operation
subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
......@@ -181,8 +170,8 @@ void softmax(real* I,
divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
}
template<int blockSize>
__global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
template <int blockSize>
__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
int base = threadIdx.x;
__shared__ real dfMax_s[blockSize];
int nextIdx = blockIdx.x * dimN + base;
......@@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
}
void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) {
void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
dim3 block(512, 1);
dim3 grid(dimM, 1);
KeMatrixSoftMax<512>
<<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
CHECK_SYNC("hl_matrix_softmax failed");
}
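The device helpers composed above (findMax, subMaxAndExp, valueSum, divSum) implement the numerically stable softmax softmax(x_i) = exp(x_i - max(x)) / sum_j exp(x_j - max(x)), with each 512-thread block handling one row cooperatively. A compact single-thread sketch of the same arithmetic — illustrative only, not the kernel used here:

#include <math.h>

// Illustrative stable softmax over one row of length n, mirroring the
// four steps above: find max, subtract and exponentiate, sum, divide.
void softmaxRow(const float* in, float* out, int n) {
  float m = in[0];
  for (int i = 1; i < n; ++i) {
    if (in[i] > m) m = in[i];          // find the row maximum
  }
  float sum = 0.0f;
  for (int i = 0; i < n; ++i) {
    out[i] = expf(in[i] - m);          // subtract the max before exp
    sum += out[i];
  }
  for (int i = 0; i < n; ++i) {
    out[i] /= sum;                     // normalize
  }
}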
template<int blockSize>
__global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
template <int blockSize>
__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
int base = threadIdx.x;
int bid = blockIdx.x;
__shared__ real dfMax_s[blockSize];
......@@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
}
void hl_sequence_softmax_forward(real *A_d,
real *C_d,
void hl_sequence_softmax_forward(real* A_d,
real* C_d,
const int* index,
int numSequence) {
CHECK_NOTNULL(A_d);
......@@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d,
dim3 block(512, 1);
dim3 grid(numSequence, 1);
KeSequenceSoftMax<512>
<<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
CHECK_SYNC("hl_sequence_softmax_forward failed");
}
__global__ void KeMatrixDerivative(real *grad_d,
real *output_d,
real *sftmaxSum_d,
int dimM,
int dimN) {
int rowIdx = blockIdx.x*blockDim.x + threadIdx.x;
int colIdx = blockIdx.y*blockDim.y + threadIdx.y;
__global__ void KeMatrixDerivative(
real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
int index;
if (rowIdx < dimM && colIdx < dimN) {
index = rowIdx*dimN + colIdx;
index = rowIdx * dimN + colIdx;
grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
}
}
void hl_matrix_softmax_derivative(real *grad_d,
real *output_d,
real *sftmaxSum_d,
int dimM,
int dimN) {
void hl_matrix_softmax_derivative(
real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
CHECK_NOTNULL(grad_d);
CHECK_NOTNULL(output_d);
CHECK_NOTNULL(sftmaxSum_d);
int blocksX = (dimM + 0) / 1;
int blocksY = (dimN + 1024 -1) / 1024;
int blocksY = (dimN + 1024 - 1) / 1024;
dim3 threads(1, 1024);
dim3 grid(blocksX, blocksY);
KeMatrixDerivative<<< grid, threads, 0, STREAM_DEFAULT >>>
(grad_d, output_d, sftmaxSum_d, dimM, dimN);
KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
grad_d, output_d, sftmaxSum_d, dimM, dimN);
CHECK_SYNC("hl_matrix_softmax_derivative failed");
}
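The update grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]) is the usual softmax backward rule dx_i = y_i * (dy_i - sum_j y_j * dy_j), where sftmaxSum_d presumably holds the per-row dot product sum_j y_j * dy_j computed beforehand. A scalar per-row sketch, for illustration only:

// Illustrative per-row softmax backward, mirroring KeMatrixDerivative above.
// y: softmax outputs; dy: gradient w.r.t. the outputs, overwritten with dx.
void softmaxBackwardRow(float* dy, const float* y, int n) {
  float dot = 0.0f;
  for (int j = 0; j < n; ++j) dot += y[j] * dy[j];   // the role of sftmaxSum_d
  for (int i = 0; i < n; ++i) dy[i] = y[i] * (dy[i] - dot);
}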
__global__ void KeMatrixMultiBinaryCrossEntropy(real* output,
real* entropy,
int* row,
int* col,
int dimM,
int dimN) {
__global__ void KeMatrixMultiBinaryCrossEntropy(
real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < dimM) {
for (int i = 0; i < dimN; i ++) {
for (int i = 0; i < dimN; i++) {
entropy[index] -= log(1 - output[index * dimN + i]);
}
int *row_col = col + row[index];
int* row_col = col + row[index];
int col_num = row[index + 1] - row[index];
for (int i = 0; i < col_num; i ++) {
for (int i = 0; i < col_num; i++) {
real o = output[index * dimN + row_col[i]];
entropy[index] -= log(o / (1 - o));
}
......@@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output,
dim3 threads(n_threads);
dim3 grid(blocks);
hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
KeMatrixMultiBinaryCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>>
(output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
}
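For binary labels stored as a CSR row (the column indices of the positive classes), the kernel above first accumulates -log(1 - o) over every class and then adds -log(o / (1 - o)) for the positive columns, which together equals the usual multi-label cross entropy -sum_j [ y_j * log(o_j) + (1 - y_j) * log(1 - o_j) ]. A per-row host sketch of the same computation — illustrative only; rowEntropy and its arguments are not PaddlePaddle API:

#include <math.h>

// Illustrative multi-label cross entropy for one row; posCols lists the
// columns whose label is 1, every other column counts as label 0.
float rowEntropy(const float* o, int dimN, const int* posCols, int numPos) {
  float e = 0.0f;
  for (int j = 0; j < dimN; ++j) e -= logf(1.0f - o[j]);  // treat all as negatives
  for (int k = 0; k < numPos; ++k) {
    int j = posCols[k];
    e -= logf(o[j] / (1.0f - o[j]));                      // correct the positives
  }
  return e;  // == -sum_j [ y_j*log(o_j) + (1-y_j)*log(1-o_j) ]
}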
__global__ void KeMatrixMultiBinaryCrossEntropyBp(real* output,
real* grad,
int* row,
int* col,
int dimM,
int dimN) {
__global__ void KeMatrixMultiBinaryCrossEntropyBp(
real* output, real* grad, int* row, int* col, int dimM, int dimN) {
int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (row_idx < dimM) {
for (int i = 0; i < dimN; i ++) {
for (int i = 0; i < dimN; i++) {
int index = row_idx * dimN + i;
grad[index] += 1.0 / (1 - output[index]);
}
int col_num = row[row_idx + 1] - row[row_idx];
int *row_col = col + row[row_idx];
for (int i = 0; i < col_num; i ++) {
int* row_col = col + row[row_idx];
for (int i = 0; i < col_num; i++) {
int index = row_idx * dimN + row_col[i];
grad[index] -= 1.0 / (output[index] * (1 - output[index]));
}
}
}
void hl_matrix_multi_binary_cross_entropy_bp(real* output,
real* grad,
hl_sparse_matrix_s csr_mat,
int dimM,
int dimN) {
void hl_matrix_multi_binary_cross_entropy_bp(
real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
CHECK_NOTNULL(output);
CHECK_NOTNULL(grad);
CHECK_NOTNULL(csr_mat);
......@@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output,
dim3 threads(n_threads);
dim3 grid(blocks);
hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
KeMatrixMultiBinaryCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>>
(output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
}
__global__ void KeMatrixCrossEntropy(real* O,
real* E,
int* label,
int dimM,
int dimN) {
__global__ void KeMatrixCrossEntropy(
real* O, real* E, int* label, int dimM, int dimN) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int newBase;
if (index < dimM) {
......@@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O,
}
}
void hl_matrix_cross_entropy(real* A_d,
real* C_d,
int* label_d,
int dimM,
int dimN) {
void hl_matrix_cross_entropy(
real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
int blocks = (dimM + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KeMatrixCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>>
(A_d, C_d, label_d, dimM, dimN);
KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
A_d, C_d, label_d, dimM, dimN);
CHECK_SYNC("hl_matrix_cross_entropy failed");
}
__global__ void KeMatrixCrossEntropyBp(real* grad_d,
real* output_d,
int* label_d,
int dimM,
int dimN) {
int rowIdx = blockIdx.x*blockDim.x + threadIdx.x;
int colIdx = blockIdx.y*blockDim.y + threadIdx.y;
__global__ void KeMatrixCrossEntropyBp(
real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
int index;
if (rowIdx < dimM && colIdx < dimN) {
index = rowIdx*dimN + colIdx;
index = rowIdx * dimN + colIdx;
if (label_d[rowIdx] == colIdx) {
grad_d[index] -= 1.0f / output_d[index];
}
}
}
void hl_matrix_cross_entropy_bp(real* grad_d,
real* output_d,
int* label_d,
int dimM,
int dimN) {
void hl_matrix_cross_entropy_bp(
real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
CHECK_NOTNULL(grad_d);
CHECK_NOTNULL(output_d);
CHECK_NOTNULL(label_d);
int blocksX = (dimM + 0)/1;
int blocksY = (dimN + 1024 -1) / 1024;
int blocksX = (dimM + 0) / 1;
int blocksY = (dimN + 1024 - 1) / 1024;
dim3 threads(1, 1024);
dim3 grid(blocksX, blocksY);
KeMatrixCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>>
(grad_d, output_d, label_d, dimM, dimN);
KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
grad_d, output_d, label_d, dimM, dimN);
CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
}
void hl_matrix_zero_mem(real* data, int num) {
hl_gpu_apply_unary_op(
unary::Zero<real>(), data, 1, num, num);
hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
}
__global__ void KeParamReluForward(real* output,
......@@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output,
int ty = blockIdx.y * blockDim.y + threadIdx.y;
if (tx < width && ty < height) {
int index = ty * width + tx;
output[index] = input[index] > 0 ? input[index] :
input[index] * w[tx / partial_sum];
output[index] =
input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
}
}
......@@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output,
CHECK_NOTNULL(w);
dim3 threads(16, 16);
int blockX = (width + 16 - 1) / 16;
int blockY = (height + 16 -1) / 16;
int blockY = (height + 16 - 1) / 16;
dim3 grid(blockX, blockY);
KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>
(output, input, w, width, height, partial_sum);
KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
output, input, w, width, height, partial_sum);
CHECK_SYNC("hl_param_relu_forward failed");
}
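KeParamReluForward above is a parametric ReLU whose negative slope is shared by groups of partial_sum consecutive columns: positive inputs pass through, negative inputs are scaled by w[tx / partial_sum]. The scalar rule, as an illustrative sketch with assumed names:

// Illustrative scalar PReLU with a slope shared per group of `partialSum`
// consecutive columns, mirroring KeParamReluForward above.
float paramRelu(float x, const float* w, int col, int partialSum) {
  return x > 0 ? x : x * w[col / partialSum];
}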
template<int blockSize>
template <int blockSize>
__global__ void KeParamReluBackWardW(real* grad_w,
real* grad_o,
real* input,
......@@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w,
int grid_num = width / partial_sum;
dim3 threads(blockSize, 1);
dim3 grid(grid_num, 1);
KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
(grad_w, grad_o, input, width, height, partial_sum);
KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
grad_w, grad_o, input, width, height, partial_sum);
CHECK_SYNC("hl_param_relu_backward_w failed");
}
......@@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o,
CHECK_NOTNULL(diff);
dim3 threads(16, 16);
int blockX = (width + 16 - 1) / 16;
int blockY = (height + 16 -1) / 16;
int blockY = (height + 16 - 1) / 16;
dim3 grid(blockX, blockY);
KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>
(grad_o, data, w, diff, width, height, partial_sum);
KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
grad_o, data, w, diff, width, height, partial_sum);
CHECK_SYNC("hl_param_relu_backward_diff failed");
}
__global__ void KeMatrixAddSharedBias(real* A,
real* B,
const int channel,
const int M,
const int N,
real scale) {
__global__ void KeMatrixAddSharedBias(
real* A, real* B, const int channel, const int M, const int N, real scale) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int dim = N / channel;
if (index < M * N) {
......@@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d,
real scale) {
const int blocks = 512;
const int grids = DIVUP(dimM * dimN, blocks);
KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>
(A_d, B_d, channel, dimM, dimN, scale);
KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
A_d, B_d, channel, dimM, dimN, scale);
CHECK_SYNC("hl_matrix_add_shared_bias failed");
}
template <int blockSize>
__global__ void KeMatrixCollectSharedBias(real *B,
real *A,
__global__ void KeMatrixCollectSharedBias(real* B,
real* A,
const int channel,
const int M,
const int N,
......@@ -611,14 +563,13 @@ void hl_matrix_collect_shared_bias(real* B_d,
const int limit = 64;
int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;
KeMatrixCollectSharedBias<blocks>
<<< grids, blocks, 0, STREAM_DEFAULT>>>
(B_d, A_d, channel, dimM, dimN, dim, limit, scale);
KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
B_d, A_d, channel, dimM, dimN, dim, limit, scale);
CHECK_SYNC("hl_matrix_collect_shared_bias failed");
}
__global__ void keMatrixRotate(real* mat, real* matRot,
int dimM, int dimN, bool clockWise) {
__global__ void keMatrixRotate(
real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < dimM * dimN) {
int i = idx / dimN;
......@@ -631,13 +582,13 @@ __global__ void keMatrixRotate(real* mat, real* matRot,
}
}
void hl_matrix_rotate(real *mat, real* matRot,
int dimM, int dimN, bool clockWise) {
void hl_matrix_rotate(
real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
CHECK_NOTNULL(mat);
CHECK_NOTNULL(matRot);
const int threads = 512;
const int blocks = DIVUP(dimM * dimN, threads);
keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>>
(mat, matRot, dimM, dimN, clockWise);
keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
mat, matRot, dimM, dimN, clockWise);
CHECK_SYNC("hl_matrix_rotate failed");
}
......@@ -16,36 +16,36 @@ limitations under the License. */
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"
__global__ void KeMaxSequenceForward(real *input,
const int *sequence,
__global__ void KeMaxSequenceForward(real* input,
const int* sequence,
real* output,
int *index,
int* index,
int numSequences,
int dim) {
int dimIdx = threadIdx.x;
int sequenceId = blockIdx.x;
if (sequenceId >= numSequences) return;
int start = sequence[sequenceId];
int end = sequence[sequenceId+1];
int end = sequence[sequenceId + 1];
for (int i = dimIdx; i < dim; i += blockDim.x) {
real tmp = -HL_FLOAT_MAX;
int tmpId = -1;
for (int insId = start; insId < end; insId++) {
if (tmp < input[insId*dim + i]) {
tmp = input[insId*dim + i];
if (tmp < input[insId * dim + i]) {
tmp = input[insId * dim + i];
tmpId = insId;
}
}
output[sequenceId*dim + i] = tmp;
index[sequenceId*dim + i] = tmpId;
output[sequenceId * dim + i] = tmp;
index[sequenceId * dim + i] = tmpId;
}
}
void hl_max_sequence_forward(real* input,
const int* sequence,
real* output,
int *index,
int* index,
int numSequences,
int dim) {
CHECK_NOTNULL(input);
......@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input,
dim3 threads(256, 1);
dim3 grid(numSequences, 1);
KeMaxSequenceForward<<< grid, threads, 0, STREAM_DEFAULT >>>
(input, sequence, output, index, numSequences, dim);
KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
input, sequence, output, index, numSequences, dim);
CHECK_SYNC("hl_max_sequence_forward failed");
}
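KeMaxSequenceForward above reduces each sequence (rows sequence[i] through sequence[i+1] of the input) to its per-dimension maximum and records the winning row in index, so the backward kernel below can route the output gradient back to that row only. A per-column host sketch, illustrative only:

// Illustrative max-over-sequence pooling for a single feature column,
// mirroring KeMaxSequenceForward above; *argRow receives the winning row.
float maxOverSequence(const float* input, int start, int end, int dim, int col,
                      int* argRow) {
  float best = -3.402823466e+38f;  // plays the role of -HL_FLOAT_MAX above
  *argRow = -1;
  for (int row = start; row < end; ++row) {
    float v = input[row * dim + col];
    if (v > best) {
      best = v;
      *argRow = row;
    }
  }
  return best;
}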
__global__ void KeMaxSequenceBackward(real *outputGrad,
int *index,
real* inputGrad,
int numSequences,
int dim) {
__global__ void KeMaxSequenceBackward(
real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int colIdx = idx % dim;
if (idx < numSequences*dim) {
if (idx < numSequences * dim) {
int insId = index[idx];
inputGrad[insId * dim + colIdx] += outputGrad[idx];
}
}
void hl_max_sequence_backward(real* outputGrad,
int *index,
real* inputGrad,
int numSequences,
int dim) {
void hl_max_sequence_backward(
real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
CHECK_NOTNULL(outputGrad);
CHECK_NOTNULL(index);
CHECK_NOTNULL(inputGrad);
......@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad,
unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
dim3 threads(128, 1);
dim3 grid(blocks, 1);
KeMaxSequenceBackward<<< grid, threads, 0, STREAM_DEFAULT >>>
(outputGrad, index, inputGrad, numSequences, dim);
KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
outputGrad, index, inputGrad, numSequences, dim);
CHECK_SYNC("hl_max_sequence_backward failed");
}
template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
real* table,
int* ids,
......@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output,
while (sampleId < numSamples) {
int tableId = ids[sampleId];
if ((0 <= tableId) && (tableId < tableSize)) {
real *outputData = output + sampleId * dim;
real *tableData = table + tableId * dim;
real* outputData = output + sampleId * dim;
real* tableData = table + tableId * dim;
for (int i = idx; i < dim; i += blockDimX) {
if (AddRow == 0) {
outputData[i] += tableData[i];
......@@ -114,15 +108,18 @@ __global__ void KeMatrixAddRows(real* output,
}
}
}
sampleId += blockDimY*gridDimX;
sampleId += blockDimY * gridDimX;
}
}
template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd>
__global__
void KeSequence2Batch(real *batch,
real *sequence,
const int *batchIndex,
template <int blockDimX,
int blockDimY,
int gridDimX,
bool seq2batch,
bool isAdd>
__global__ void KeSequence2Batch(real* batch,
real* sequence,
const int* batchIndex,
int seqWidth,
int batchCount) {
int idx = threadIdx.x;
......@@ -130,8 +127,8 @@ void KeSequence2Batch(real *batch,
int id = blockIdx.x + idy * gridDimX;
while (id < batchCount) {
int seqId = batchIndex[id];
real* batchData = batch + id*seqWidth;
real* seqData = sequence + seqId*seqWidth;
real* batchData = batch + id * seqWidth;
real* seqData = sequence + seqId * seqWidth;
for (int i = idx; i < seqWidth; i += blockDimX) {
if (seq2batch) {
if (isAdd) {
......@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch,
}
}
}
id += blockDimY*gridDimX;
id += blockDimY * gridDimX;
}
}
void hl_sequence2batch_copy(real *batch,
real *sequence,
const int *batchIndex,
void hl_sequence2batch_copy(real* batch,
real* sequence,
const int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {
......@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch,
dim3 threads(128, 8);
dim3 grid(8, 1);
if (seq2batch) {
KeSequence2Batch<128, 8, 8, 1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
(batch, sequence, batchIndex, seqWidth, batchCount);
KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, batchIndex, seqWidth, batchCount);
} else {
KeSequence2Batch<128, 8, 8, 0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
(batch, sequence, batchIndex, seqWidth, batchCount);
KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, batchIndex, seqWidth, batchCount);
}
CHECK_SYNC("hl_sequence2batch_copy failed");
}
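KeSequence2Batch above shuffles rows between the sequence layout and the batch layout: batchIndex[id] names the sequence row that corresponds to batch row id, the seq2batch template flag picks the copy direction, and isAdd turns the copy into an accumulation (used by hl_sequence2batch_add below). A simplified host sketch of the copy case, with illustrative names:

// Illustrative row gather/scatter mirroring KeSequence2Batch above (copy
// case only); batchIndex[b] is the sequence row feeding batch row b.
void sequence2batchCopyHost(float* batch, float* sequence, const int* batchIndex,
                            int seqWidth, int batchCount, bool seq2batch) {
  for (int b = 0; b < batchCount; ++b) {
    int s = batchIndex[b];
    for (int i = 0; i < seqWidth; ++i) {
      if (seq2batch) {
        batch[b * seqWidth + i] = sequence[s * seqWidth + i];   // gather
      } else {
        sequence[s * seqWidth + i] = batch[b * seqWidth + i];   // scatter
      }
    }
  }
}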
void hl_sequence2batch_add(real *batch,
real *sequence,
int *batchIndex,
void hl_sequence2batch_add(real* batch,
real* sequence,
int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {
......@@ -186,18 +183,17 @@ void hl_sequence2batch_add(real *batch,
dim3 threads(128, 8);
dim3 grid(8, 1);
if (seq2batch) {
KeSequence2Batch<128, 8, 8, 1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
(batch, sequence, batchIndex, seqWidth, batchCount);
KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, batchIndex, seqWidth, batchCount);
} else {
KeSequence2Batch<128, 8, 8, 0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
(batch, sequence, batchIndex, seqWidth, batchCount);
KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, batchIndex, seqWidth, batchCount);
}
CHECK_SYNC("hl_sequence2batch_add failed");
}
template<bool normByTimes, bool seq2batch>
__global__
void KeSequence2BatchPadding(real* batch,
template <bool normByTimes, bool seq2batch>
__global__ void KeSequence2BatchPadding(real* batch,
real* sequence,
const int* sequenceStartPositions,
const size_t sequenceWidth,
......@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch,
if (seq2batch) {
/* sequence -> batch */
if (normByTimes) {
KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch,
sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} else {
KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch,
sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
}
} else {
/* batch -> sequence */
if (normByTimes) {
KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch,
sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} else {
KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch,
sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
}
}
CHECK_SYNC("hl_sequence2batch_copy_padding failed");
}
__device__ inline float my_rsqrt(float x) {
return rsqrtf(x);
}
__device__ inline float my_rsqrt(float x) { return rsqrtf(x); }
__device__ inline double my_rsqrt(double x) {
return rsqrt(x);
}
__device__ inline double my_rsqrt(double x) { return rsqrt(x); }
__global__ void KeSequenceAvgForward(real* dst,
real* src,
......@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst,
for (int i = start; i < end; i++) {
sum += src[i * width + col];
}
sum = mode == 1 ? sum :
(mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
: sum * my_rsqrt((real)seqLength));
dst[gid] += sum;
}
}
......@@ -349,8 +357,8 @@ void hl_sequence_avg_forward(real* dst,
CHECK(mode == 0 || mode == 1 || mode == 2)
<< "mode error in hl_sequence_avg_forward!";
KeSequenceAvgForward<<< grid, block, 0, STREAM_DEFAULT >>>
(dst, src, starts, height, width, mode);
KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_forward failed");
}
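The mode switch above selects how a sequence of length L is pooled: mode 0 averages (sum / L), mode 1 keeps the plain sum, and mode 2 scales by 1/sqrt(L) via my_rsqrt; the backward kernel below applies the same factor to the gradient. A host-side sketch of the selection, illustrative only:

#include <math.h>

// Illustrative pooling of one column of a sequence of length `len`,
// mirroring the mode handling in KeSequenceAvgForward above.
float poolSequenceColumn(const float* vals, int len, int mode) {
  float sum = 0.0f;
  for (int i = 0; i < len; ++i) sum += vals[i];
  if (mode == 0) return sum / len;              // average
  if (mode == 1) return sum;                    // plain sum
  return sum / sqrtf((float)len);               // mode 2: sum / sqrt(len)
}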
......@@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst,
int seqLength = end - start;
if (seqLength == 0) return;
real grad = src[gid];
grad = mode == 1 ? grad :
(mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength));
grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
: grad * my_rsqrt((real)seqLength));
for (int i = start; i < end; i++) {
dst[i * width + col] += grad;
}
......@@ -394,7 +402,7 @@ void hl_sequence_avg_backward(real* dst,
CHECK(mode == 0 || mode == 1 || mode == 2)
<< "mode error in hl_sequence_avg_backward!";
KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>>
(dst, src, starts, height, width, mode);
KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_backward failed");
}
This diff has been collapsed.
......@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include <stdlib.h>
#include "hl_cuda.h"
#include "hl_time.h"
#include <cmath>
#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_perturbation_util.cuh"
#include "hl_time.h"
#define _USE_MATH_DEFINES
......@@ -30,10 +29,16 @@ limitations under the License. */
* centerX, centerY: translation.
* sourceX, sourceY: output coordinates in the original image.
*/
__device__ void getTranformCoord(int x, int y, real theta, real scale,
real tgtCenter, real imgCenter,
real centerR, real centerC,
int* sourceX, int* sourceY) {
__device__ void getTranformCoord(int x,
int y,
real theta,
real scale,
real tgtCenter,
real imgCenter,
real centerR,
real centerC,
int* sourceX,
int* sourceY) {
real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};
// compute coordinates in the rotated and scaled image
......@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
* created by Wei Xu (genome), converted by Jiang Wang
*/
__global__ void kSamplingPatches(const real* imgs, real* targets,
int imgSize, int tgtSize, const int channels,
int samplingRate, const real* thetas,
const real* scales, const int* centerRs,
const int* centerCs, const real padValue,
__global__ void kSamplingPatches(const real* imgs,
real* targets,
int imgSize,
int tgtSize,
const int channels,
int samplingRate,
const real* thetas,
const real* scales,
const int* centerRs,
const int* centerCs,
const real padValue,
const int numImages) {
const int caseIdx = blockIdx.x * 4 + threadIdx.x;
const int pxIdx = blockIdx.y * 128 + threadIdx.y;
......@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
const int pxY = pxIdx / tgtSize;
int srcPxX, srcPxY;
getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter,
imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX,
getTranformCoord(pxX,
pxY,
thetas[imgIdx],
scales[imgIdx],
tgtCenter,
imgCenter,
centerCs[caseIdx],
centerRs[caseIdx],
&srcPxX,
&srcPxY);
imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
......@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
*
* created by Wei Xu
*/
void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
int*& gpuCenterR, int*& gpuCenterC,
int numImages, int imgSize, real rotateAngle,
real scaleRatio, int samplingRate,
void hl_generate_disturb_params(real*& gpuAngle,
real*& gpuScaleRatio,
int*& gpuCenterR,
int*& gpuCenterC,
int numImages,
int imgSize,
real rotateAngle,
real scaleRatio,
int samplingRate,
bool isTrain) {
// The number of output samples.
int numPatches = numImages * samplingRate;
......@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
for (int i = 0; i < numImages; i++) {
r_angle[i] =
(rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0) // NOLINT
- 0.5);
-
0.5);
s_ratio[i] =
1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio; // NOLINT
}
......@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
int pxY =
(int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0)); // NOLINT
const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]),
sin(-r_angle[i]), cos(-r_angle[i])};
const real H[4] = {cos(-r_angle[i]),
-sin(-r_angle[i]),
sin(-r_angle[i]),
cos(-r_angle[i])};
real x = pxX - imgCenter;
real y = pxY - imgCenter;
real xx = H[0] * x + H[1] * y;
......@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
delete[] center_c;
}
void hl_conv_random_disturb_with_params(const real* images, int imgSize,
int tgtSize, int channels,
int numImages, int samplingRate,
void hl_conv_random_disturb_with_params(const real* images,
int imgSize,
int tgtSize,
int channels,
int numImages,
int samplingRate,
const real* gpuRotationAngle,
const real* gpuScaleRatio,
const int* gpuCenterR,
......@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
dim3 threadsPerBlock(4, 128);
dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));
kSamplingPatches <<<numBlocks, threadsPerBlock>>>
(images, target, imgSize, tgtSize, channels, samplingRate,
gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC,
paddingValue, numImages);
kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
target,
imgSize,
tgtSize,
channels,
samplingRate,
gpuRotationAngle,
gpuScaleRatio,
gpuCenterR,
gpuCenterC,
paddingValue,
numImages);
hl_device_synchronize();
}
void hl_conv_random_disturb(const real* images, int imgSize,
int tgtSize, int channels, int numImages,
real scaleRatio, real rotateAngle,
int samplingRate, real* gpu_r_angle,
real* gpu_s_ratio, int* gpu_center_r,
int* gpu_center_c, int paddingValue,
bool isTrain, real* targets) {
void hl_conv_random_disturb(const real* images,
int imgSize,
int tgtSize,
int channels,
int numImages,
real scaleRatio,
real rotateAngle,
int samplingRate,
real* gpu_r_angle,
real* gpu_s_ratio,
int* gpu_center_r,
int* gpu_center_c,
int paddingValue,
bool isTrain,
real* targets) {
// generate the random disturbance sequence and the sampling locations
hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r,
gpu_center_c, numImages, imgSize, rotateAngle,
scaleRatio, samplingRate, isTrain);
hl_conv_random_disturb_with_params(
images, imgSize, tgtSize, channels, numImages,
samplingRate, gpu_r_angle, gpu_s_ratio,
gpu_center_r, gpu_center_r, paddingValue,
hl_generate_disturb_params(gpu_r_angle,
gpu_s_ratio,
gpu_center_r,
gpu_center_c,
numImages,
imgSize,
rotateAngle,
scaleRatio,
samplingRate,
isTrain);
hl_conv_random_disturb_with_params(images,
imgSize,
tgtSize,
channels,
numImages,
samplingRate,
gpu_r_angle,
gpu_s_ratio,
gpu_center_r,
gpu_center_r,
paddingValue,
targets);
}
......@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_cuda.h"
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"
template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output, int ldo,
real* table, int ldt,
template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
int ldo,
real* table,
int ldt,
int* ids,
int numSamples,
int tableSize,
......@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
while (idy < numSamples) {
int tableId = ids[idy];
if ((0 <= tableId) && (tableId < tableSize)) {
real *out = output + idy * ldo;
real *tab = table + tableId * ldt;
real* out = output + idy * ldo;
real* tab = table + tableId * ldt;
for (int i = idx; i < dim; i += blockDimX) {
if (AddRow) {
paddle::paddleAtomicAdd(&tab[i], out[i]);
......@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
}
}
void hl_matrix_select_rows(real* output, int ldo,
real* table, int ldt,
void hl_matrix_select_rows(real* output,
int ldo,
real* table,
int ldt,
int* ids,
int numSamples,
int tableSize,
......@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
dim3 threads(128, 8);
dim3 grid(8, 1);
KeMatrixAddRows<128, 8, 8, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
(output, ldo, table, ldt, ids, numSamples, tableSize, dim);
KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
output, ldo, table, ldt, ids, numSamples, tableSize, dim);
CHECK_SYNC("hl_matrix_select_rows failed");
}
void hl_matrix_add_to_rows(real* table, int ldt,
real* input, int ldi,
void hl_matrix_add_to_rows(real* table,
int ldt,
real* input,
int ldi,
int* ids,
int numSamples,
int tableSize,
......@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
dim3 threads(128, 8);
dim3 grid(8, 1);
KeMatrixAddRows<128, 8, 8, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
(input, ldi, table, ldt, ids, numSamples, tableSize, dim);
KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
input, ldi, table, ldt, ids, numSamples, tableSize, dim);
CHECK_SYNC("hl_matrix_add_to_rows failed");
}
template<class T, int blockDimX, int gridDimX>
__global__ void KeVectorSelect(T* dst, int sized,
const T* src, int sizes,
const int* ids, int sizei) {
template <class T, int blockDimX, int gridDimX>
__global__ void KeVectorSelect(
T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
int idx = threadIdx.x + blockDimX * blockIdx.x;
while (idx < sizei) {
int index = ids[idx];
......@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
}
template <class T>
void hl_vector_select_from(T* dst, int sized,
const T* src, int sizes,
const int* ids, int sizei) {
void hl_vector_select_from(
T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
CHECK_NOTNULL(dst);
CHECK_NOTNULL(src);
CHECK_NOTNULL(ids);
......@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
dim3 threads(512, 1);
dim3 grid(8, 1);
KeVectorSelect<T, 512, 8><<< grid, threads, 0, STREAM_DEFAULT >>>
(dst, sized, src, sizes, ids, sizei);
KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
dst, sized, src, sizes, ids, sizei);
CHECK_SYNC("hl_vector_select_from failed");
}
template
void hl_vector_select_from(real* dst, int sized,
const real* src, int sizes,
const int* ids, int sizei);
template
void hl_vector_select_from(int* dst, int sized,
const int* src, int sizes,
const int* ids, int sizei);
template void hl_vector_select_from(real* dst,
int sized,
const real* src,
int sizes,
const int* ids,
int sizei);
template void hl_vector_select_from(
int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);
......@@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "hl_top_k.h"
#include "hl_sparse.ph"
#include "hl_top_k.h"
#include "paddle/utils/Logging.h"
// using namespace hppl;
struct Pair {
__device__ __forceinline__
Pair() {}
__device__ __forceinline__ Pair() {}
__device__ __forceinline__
Pair(real value, int id) : v_(value), id_(id) {}
__device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}
__device__ __forceinline__
void set(real value, int id) {
__device__ __forceinline__ void set(real value, int id) {
v_ = value;
id_ = id;
}
__device__ __forceinline__
void operator=(const Pair& in) {
__device__ __forceinline__ void operator=(const Pair& in) {
v_ = in.v_;
id_ = in.id_;
}
__device__ __forceinline__
bool operator<(const real value) const {
__device__ __forceinline__ bool operator<(const real value) const {
return (v_ < value);
}
__device__ __forceinline__
bool operator<(const Pair& in) const {
__device__ __forceinline__ bool operator<(const Pair& in) const {
return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
}
__device__ __forceinline__
bool operator>(const Pair& in) const {
__device__ __forceinline__ bool operator>(const Pair& in) const {
return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
}
......@@ -58,8 +50,9 @@ struct Pair {
int id_;
};
__device__ __forceinline__
void addTo(Pair topK[], const Pair &p, int beamSize) {
__device__ __forceinline__ void addTo(Pair topK[],
const Pair& p,
int beamSize) {
for (int k = beamSize - 2; k >= 0; k--) {
if (topK[k] < p) {
topK[k + 1] = topK[k];
......@@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) {
topK[0] = p;
}
template<int beamSize>
__device__ __forceinline__
void addTo(Pair topK[], const Pair &p) {
template <int beamSize>
__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) {
for (int k = beamSize - 2; k >= 0; k--) {
if (topK[k] < p) {
topK[k + 1] = topK[k];
......@@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) {
topK[0] = p;
}
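addTo above keeps topK[] sorted in descending order as a small beam: starting from the second-to-last slot, every entry smaller than the candidate is shifted one slot down (dropping the old minimum), and the candidate is placed where the shifting stops. A host sketch of that insertion — the id tie-breaking that the Pair comparison above also performs is omitted for brevity, and the Candidate type is an assumed name:

// Illustrative insertion into a fixed-size, descending top-k buffer,
// mirroring the addTo() device helper above (id tie-breaking omitted).
struct Candidate {
  float v;
  int id;
};

void insertTopK(Candidate topK[], int beamSize, Candidate p) {
  for (int k = beamSize - 2; k >= 0; --k) {
    if (topK[k].v < p.v) {
      topK[k + 1] = topK[k];   // shift the smaller entry down
    } else {
      topK[k + 1] = p;         // found the slot just below a larger entry
      return;
    }
  }
  topK[0] = p;                 // p is the new maximum
}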
template<int blockSize>
__device__ __forceinline__
void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
template <int blockSize>
__device__ __forceinline__ void getTopK(
Pair topK[], real* src, int idx, int dim, int beamSize) {
while (idx < dim) {
if (topK[beamSize - 1] < src[idx]) {
Pair tmp(src[idx], idx);
......@@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
}
}
template<int blockSize>
__device__ __forceinline__
void getTopK(Pair topK[], real *src, int idx, int dim,
const Pair& max, int beamSize) {
template <int blockSize>
__device__ __forceinline__ void getTopK(
Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) {
while (idx < dim) {
if (topK[beamSize - 1] < src[idx]) {
Pair tmp(src[idx], idx);
......@@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim,
}
}
template<int blockSize>
__device__ __forceinline__
void getTopK(Pair topK[], real *val, int *col,
int idx, int dim, int beamSize) {
template <int blockSize>
__device__ __forceinline__ void getTopK(
Pair topK[], real* val, int* col, int idx, int dim, int beamSize) {
while (idx < dim) {
if (topK[beamSize - 1] < val[idx]) {
Pair tmp(val[idx], col[idx]);
......@@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col,
}
}
template<int blockSize>
__device__ __forceinline__
void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
const Pair& max, int beamSize) {
template <int blockSize>
__device__ __forceinline__ void getTopK(Pair topK[],
real* val,
int* col,
int idx,
int dim,
const Pair& max,
int beamSize) {
while (idx < dim) {
if (topK[beamSize - 1] < val[idx]) {
Pair tmp(val[idx], col[idx]);
......@@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
}
}
template<int maxLength, int blockSize>
__device__ __forceinline__
void threadGetTopK(Pair topK[], int& beam, int beamSize,
template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
int& beam,
int beamSize,
real* src,
bool& firstStep, bool& isEmpty, Pair& max,
int dim, const int tid) {
bool& firstStep,
bool& isEmpty,
Pair& max,
int dim,
const int tid) {
if (beam > 0) {
int length = beam < beamSize ? beam : beamSize;
if (firstStep) {
......@@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
}
}
if (!isEmpty) {
getTopK<blockSize>(topK + maxLength - beam, src, tid, dim,
max, length);
getTopK<blockSize>(topK + maxLength - beam, src, tid, dim, max, length);
}
}
......@@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
}
}
template<int maxLength, int blockSize>
__device__ __forceinline__
void threadGetTopK(Pair topK[], int& beam, int beamSize,
real* val, int* col,
bool& firstStep, bool& isEmpty, Pair& max,
int dim, const int tid) {
template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
int& beam,
int beamSize,
real* val,
int* col,
bool& firstStep,
bool& isEmpty,
Pair& max,
int dim,
const int tid) {
if (beam > 0) {
int length = beam < beamSize ? beam : beamSize;
if (firstStep) {
......@@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
}
}
if (!isEmpty) {
getTopK<blockSize>(topK + maxLength - beam, val, col, tid, dim,
max, length);
getTopK<blockSize>(
topK + maxLength - beam, val, col, tid, dim, max, length);
}
}
......@@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
}
}
template<int maxLength, int blockSize>
__device__ __forceinline__
void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
real** topVal, int** topIds,
int& beam, int& beamSize,
const int tid, const int warp) {
template <int maxLength, int blockSize>
__device__ __forceinline__ void blockReduce(Pair* shTopK,
int* maxId,
Pair topK[],
real** topVal,
int** topIds,
int& beam,
int& beamSize,
const int tid,
const int warp) {
while (true) {
__syncthreads();
if (tid < blockSize / 2) {
......@@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
}
}
__syncthreads();
for (int stride = blockSize / 4; stride > 0; stride = stride/2) {
for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
if (tid < stride) {
if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
maxId[tid] = maxId[tid + stride];
......@@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
 * 3. go to the second step, until one thread's topK value is null;
 * 4. go to the first step, until the topK value is obtained.
*/
template<int maxLength, int blockSize>
__global__ void KeMatrixTopK(real* topVal, int ldv,
int * topIds,
real* src, int lds,
template <int maxLength, int blockSize>
__global__ void KeMatrixTopK(real* topVal,
int ldv,
int* topIds,
real* src,
int lds,
int dim,
int beamSize) {
__shared__ Pair shTopK[blockSize];
......@@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
topK[k].set(-HL_FLOAT_MAX, -1);
}
while (beamSize) {
threadGetTopK<maxLength, blockSize>
(topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
threadGetTopK<maxLength, blockSize>(
topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
shTopK[tid] = topK[0];
blockReduce<maxLength, blockSize>
(shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
blockReduce<maxLength, blockSize>(
shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
}
}
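
A small worked example of what this kernel computes (illustrative values only, not taken from the commit): with one block handling one row, a row src = {0.1, 0.9, 0.3, 0.7}, dim = 4 and beamSize = 2 should produce topVal = {0.9, 0.7} and topIds = {1, 3} for that row; blockReduce extracts one maximum across the block per iteration until beamSize values have been written.
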
template<int maxLength, int blockSize>
__global__ void KeSMatrixTopK(real* topVal, int ldv,
int * topIds,
template <int maxLength, int blockSize>
__global__ void KeSMatrixTopK(real* topVal,
int ldv,
int* topIds,
real* val,
int* row,
int* col,
......@@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
topK[k].set(-HL_FLOAT_MAX, -1);
}
while (beamSize) {
threadGetTopK<maxLength, blockSize>
(topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);
threadGetTopK<maxLength, blockSize>(
topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);
shTopK[tid] = topK[0];
blockReduce<maxLength, blockSize>
(shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
blockReduce<maxLength, blockSize>(
shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
}
}
void hl_matrix_top_k(real* topVal, int ldv,
int * topIds,
real* src, int lds,
void hl_matrix_top_k(real* topVal,
int ldv,
int* topIds,
real* src,
int lds,
int dim,
int beamSize,
int numSamples) {
......@@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv,
dim3 threads(256, 1);
dim3 grid(numSamples, 1);
KeMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>>
(topVal, ldv, topIds, src, lds, dim, beamSize);
KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
topVal, ldv, topIds, src, lds, dim, beamSize);
CHECK_SYNC("hl_matrix_top_k failed");
}
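
A minimal host-side sketch of calling this wrapper, assuming `real` is float, the buffers live in GPU memory, and matrices are stored row-major with `lds`/`ldv` as leading dimensions; the sizes and variable names are placeholders, not part of the commit:

  // Hypothetical usage sketch (not from the commit).
  const int numSamples = 32, dim = 1000, beamSize = 5;
  real *d_src, *d_topVal;
  int *d_topIds;
  cudaMalloc(&d_src, numSamples * dim * sizeof(real));
  cudaMalloc(&d_topVal, numSamples * beamSize * sizeof(real));
  cudaMalloc(&d_topIds, numSamples * beamSize * sizeof(int));
  // ... fill d_src with the scores to rank (e.g. cudaMemcpy from host) ...
  hl_matrix_top_k(d_topVal, beamSize,   // ldv: leading dimension of topVal
                  d_topIds,
                  d_src, dim,           // lds: leading dimension of src
                  dim, beamSize, numSamples);
  // Row i of d_topVal / d_topIds now holds the top beamSize values and
  // column indices of row i of d_src.
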
void hl_sparse_matrix_top_k(real* topVal, int ldv,
int * topIds,
void hl_sparse_matrix_top_k(real* topVal,
int ldv,
int* topIds,
hl_sparse_matrix_s src,
int beamSize,
int numSamples) {
CHECK_NOTNULL(topVal);
CHECK_NOTNULL(topIds);
CHECK_NOTNULL(src);
CHECK_EQ(src->format, HL_SPARSE_CSR)
<<"sparse matrix format error!";
CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";
hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
if (csr->csr_val == NULL || csr->csr_row == NULL ||
csr->csr_col == NULL) {
if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
LOG(FATAL) << "parameter src is null!";
}
dim3 threads(256, 1);
dim3 grid(numSamples, 1);
KeSMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>>
(topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);
KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);
CHECK_SYNC("hl_sparse_matrix_top_k failed");
}
......@@ -392,10 +402,12 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
 * 3. go to the second step, until one thread's topK value is null;
 * 4. go to the first step, until the topK values are obtained.
*/
template<int maxLength, int blockSize>
__global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
int * topIds,
real* src, int lds,
template <int maxLength, int blockSize>
__global__ void KeMatrixTopKClassificationError(real* topVal,
int ldv,
int* topIds,
real* src,
int lds,
int dim,
int beamSize,
int* label,
......@@ -420,12 +432,12 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
}
while (beamSize) {
threadGetTopK<maxLength, blockSize>
(topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
threadGetTopK<maxLength, blockSize>(
topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
shTopK[tid] = topK[0];
blockReduce<maxLength, blockSize>
(shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
blockReduce<maxLength, blockSize>(
shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
}
__syncthreads();
......@@ -440,9 +452,11 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
}
}
void hl_matrix_classification_error(real* topVal, int ldv,
void hl_matrix_classification_error(real* topVal,
int ldv,
int* topIds,
real* src, int lds,
real* src,
int lds,
int dim,
int topkSize,
int numSamples,
......@@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv,
dim3 threads(256, 1);
dim3 grid(numSamples, 1);
KeMatrixTopKClassificationError<5, 256>
<<< grid, threads, 0, STREAM_DEFAULT >>>
(topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
KeMatrixTopKClassificationError<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
CHECK_SYNC("hl_matrix_top_k classification error failed");
}
......@@ -12,13 +12,15 @@ cc_test(variable_test SRCS variable_test.cc)
cc_library(scope SRCS scope.cc)
cc_test(scope_test SRCS scope_test.cc DEPS scope)
proto_library(attr_type SRCS attr_type.proto)
proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
proto_library(attribute_proto SRCS attribute.proto)
proto_library(op_proto SRCS op_proto.proto DEPS attribute_proto)
proto_library(op_desc SRCS op_desc.proto DEPS attribute_proto)
cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope)
cc_library(attribute SRCS attribute.cc DEPS op_desc op_proto)
cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope attribute)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator)
......@@ -26,7 +28,7 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_builder)
cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)
py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
py_proto_compile(framework_py_proto SRCS attribute.proto op_proto.proto op_desc.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/attribute.h"
#include <vector>
namespace paddle {
namespace framework {
template <>
AttrType AttrTypeID<int>() {
return INT;
}
template <>
AttrType AttrTypeID<float>() {
return FLOAT;
}
template <>
AttrType AttrTypeID<std::string>() {
return STRING;
}
template <>
AttrType AttrTypeID<std::vector<int>>() {
return INTS;
}
template <>
AttrType AttrTypeID<std::vector<float>>() {
return FLOATS;
}
template <>
AttrType AttrTypeID<std::vector<std::string>>() {
return STRINGS;
}
Attribute GetAttrValue(const AttrDesc& attr_desc) {
switch (attr_desc.type()) {
case paddle::framework::AttrType::INT: {
return attr_desc.i();
}
case paddle::framework::AttrType::FLOAT: {
return attr_desc.f();
}
case paddle::framework::AttrType::STRING: {
return attr_desc.s();
}
case paddle::framework::AttrType::INTS: {
std::vector<int> val(attr_desc.ints_size());
for (int i = 0; i < attr_desc.ints_size(); ++i) {
val[i] = attr_desc.ints(i);
}
return val;
}
case paddle::framework::AttrType::FLOATS: {
std::vector<float> val(attr_desc.floats_size());
for (int i = 0; i < attr_desc.floats_size(); ++i) {
val[i] = attr_desc.floats(i);
}
return val;
}
case paddle::framework::AttrType::STRINGS: {
std::vector<std::string> val(attr_desc.strings_size());
for (int i = 0; i < attr_desc.strings_size(); ++i) {
val[i] = attr_desc.strings(i);
}
return val;
}
}
PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");
return boost::blank();
}
} // namespace framework
} // namespace paddle
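
A short sketch of how GetAttrValue might be exercised, assuming AttrDesc exposes the usual protobuf setters for the fields read above (set_name, set_type, set_f); illustrative only, not part of the commit:

  #include "paddle/framework/attribute.h"

  void AttributeExample() {
    paddle::framework::AttrDesc desc;
    desc.set_name("scale");  // hypothetical attribute name
    desc.set_type(paddle::framework::AttrType::FLOAT);
    desc.set_f(2.0f);
    paddle::framework::Attribute attr =
        paddle::framework::GetAttrValue(desc);
    float scale = boost::get<float>(attr);  // == 2.0f
    (void)scale;
  }
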
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <boost/variant.hpp>
......@@ -6,6 +20,9 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/framework/attribute.pb.h"
#include "paddle/framework/op_desc.pb.h"
#include "paddle/platform/enforce.h"
namespace paddle {
......@@ -14,13 +31,19 @@ namespace framework {
typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
std::vector<float>, std::vector<std::string>>
Attribute;
typedef std::unordered_map<std::string, Attribute> AttributeMap;
template <typename T>
AttrType AttrTypeID();
Attribute GetAttrValue(const AttrDesc& attr_desc);
// check whether a value(attribute) fit a certain limit
template <typename T>
class LargerThanChecker {
public:
LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
explicit LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const {
PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail");
}
......@@ -35,7 +58,8 @@ class LargerThanChecker {
template <typename T>
class DefaultValueSetter {
public:
DefaultValueSetter(T default_value) : default_value_(default_value) {}
explicit DefaultValueSetter(T default_value)
: default_value_(default_value) {}
void operator()(T& value) const { value = default_value_; }
private:
......@@ -78,7 +102,8 @@ class TypedAttrChecker {
typedef std::function<void(T&)> ValueChecker;
public:
TypedAttrChecker(const std::string& attr_name) : attr_name_(attr_name) {}
explicit TypedAttrChecker(const std::string& attr_name)
: attr_name_(attr_name) {}
TypedAttrChecker& InEnum(const std::unordered_set<T>& range) {
value_checkers_.push_back(EnumInContainer<T>(range));
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax="proto2";
syntax = "proto2";
package paddle.framework;
// Attribute Type for paddle's Op.
......
......@@ -59,19 +59,17 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
// If all input gradients of forwarding operator do not need to calculate,
// just return an NOP. Not return null ptr because NOP does not take
// too much time for calculation, but it is useful for simplifying logic.
if (AllInSet(forwardOp.inputs_, OperatorBase::GRAD_VAR_SUFFIX(),
no_grad_names)) {
if (AllInSet(forwardOp.inputs_, kGradVarSuffix, no_grad_names)) {
return NOP();
}
// All output gradients of forwarding operator do not need to calculate.
// Then all input gradients cannot be computed at all, and we put them into
// `no_grad_names` set. Return an NOP.
if (AllInSet(forwardOp.outputs_, OperatorBase::GRAD_VAR_SUFFIX(),
no_grad_names)) {
if (AllInSet(forwardOp.outputs_, kGradVarSuffix, no_grad_names)) {
for (auto& name : forwardOp.inputs_) {
// Mark all input is not need
no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX());
no_grad_names.insert(name + kGradVarSuffix);
}
return NOP();
}
......@@ -134,9 +132,9 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
std::shared_ptr<OperatorBase> grad_op = OpRegistry::CreateGradOp(forwardOp);
for (std::string& grad_input : grad_op->inputs_) {
if (no_grad_names.count(grad_input)) {
std::string prefix = grad_input.substr(
0, grad_input.size() - OperatorBase::GRAD_VAR_SUFFIX().size());
grad_input = prefix + OperatorBase::ZERO_VAR_SUFFIX();
std::string prefix =
grad_input.substr(0, grad_input.size() - kGradVarSuffix.size());
grad_input = prefix + kZeroVarSuffix;
// If part of input gradient of that operator is not calculated, fill
// zero variables to that input gradient.
......@@ -147,7 +145,7 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
for (std::string& grad_output : grad_op->outputs_) {
if (no_grad_names.count(grad_output)) {
grad_output = OperatorBase::EMPTY_VAR_NAME();
grad_output = kEmptyVarName;
}
}
......@@ -168,14 +166,14 @@ std::shared_ptr<OperatorBase> Backward(
std::unordered_set<std::string> no_grad_names;
no_grad_names.reserve(no_grad_vars.size());
no_grad_names.insert(OperatorBase::EMPTY_VAR_NAME() +
OperatorBase::GRAD_VAR_SUFFIX());
no_grad_names.insert(kEmptyVarName + kGradVarSuffix);
for (auto& name : no_grad_vars) {
no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX());
no_grad_names.insert(name + kGradVarSuffix);
}
size_t uid = 0;
return BackwardRecursive(forwardOp, no_grad_names, uid);
}
} // namespace framework
} // namespace paddle
......@@ -78,14 +78,14 @@ class FcOp : public ops::NetOp {
{Output("mul_result")}, {}));
auto b_name = Input("b");
std::string before_act = "mul_result";
if (b_name != EMPTY_VAR_NAME()) {
if (b_name != kEmptyVarName) {
AddOp(OpRegistry::CreateOp("rowwise_add", {Output("mul_result"), b_name},
{Output("add_result")}, {}));
before_act = "add_result";
} else {
auto out_varname = Output("add_result");
if (out_varname != EMPTY_VAR_NAME()) {
this->Rename(out_varname, EMPTY_VAR_NAME());
if (out_varname != kEmptyVarName) {
this->Rename(out_varname, kEmptyVarName);
}
}
......@@ -163,13 +163,12 @@ TEST(Backward, simple_op_grad) {
ASSERT_NE(fwd, nullptr);
auto gop = f::OpRegistry::CreateGradOp(*fwd);
ASSERT_EQ(4UL, gop->inputs_.size());
ASSERT_EQ(f::OperatorBase::EMPTY_VAR_NAME(), gop->inputs_[0]);
ASSERT_EQ(f::kEmptyVarName, gop->inputs_[0]);
ASSERT_EQ("rowwise_add_grad", gop->type_);
ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->outputs_[0]);
ASSERT_EQ("b" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->outputs_[1]);
ASSERT_EQ("X" + f::kGradVarSuffix, gop->outputs_[0]);
ASSERT_EQ("b" + f::kGradVarSuffix, gop->outputs_[1]);
ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(),
gop->Output("X" + f::OperatorBase::GRAD_VAR_SUFFIX()));
ASSERT_EQ("X" + f::kGradVarSuffix, gop->Output("X" + f::kGradVarSuffix));
}
TEST(Backward, simple_op_not_need_grad) {
......@@ -177,7 +176,7 @@ TEST(Backward, simple_op_not_need_grad) {
ASSERT_NE(fwd, nullptr);
auto gop = f::Backward(*fwd, {"X"});
ASSERT_EQ(std::find(gop->outputs_.begin(), gop->outputs_.end(),
"X" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"X" + f::kGradVarSuffix),
gop->outputs_.end());
auto no_input_gop = f::Backward(*fwd, {"X", "b"});
......@@ -210,8 +209,8 @@ TEST(Backward, net_fc_backward_normal) {
}
TEST(Backward, net_fc_backward_not_have_b) {
std::shared_ptr<f::OperatorBase> fwd = f::OpRegistry::CreateOp(
"fc", {"X", "w", f::OperatorBase::EMPTY_VAR_NAME()},
std::shared_ptr<f::OperatorBase> fwd =
f::OpRegistry::CreateOp("fc", {"X", "w", f::kEmptyVarName},
{"mul_result", "add_result", "tmp"}, {});
ASSERT_NE(fwd, nullptr);
std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
......@@ -242,24 +241,21 @@ TEST(Backward, net_input_of_network_not_need_grad) {
std::unordered_set<std::string> all_output = std::unordered_set<std::string>(
bwd_net->outputs_.begin(), bwd_net->outputs_.end());
all_output.erase(f::OperatorBase::EMPTY_VAR_NAME());
all_output.erase(f::kEmptyVarName);
for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) {
ASSERT_NE(all_output.find(out + f::OperatorBase::GRAD_VAR_SUFFIX()),
all_output.end());
ASSERT_NE(all_output.find(out + f::kGradVarSuffix), all_output.end());
}
// Not Generated X
ASSERT_EQ(all_output.find("X" + f::OperatorBase::GRAD_VAR_SUFFIX()),
all_output.end());
ASSERT_EQ(all_output.find("X" + f::kGradVarSuffix), all_output.end());
ASSERT_EQ(2UL, bwd_net->ops_.size());
ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
auto first_fc_grad = static_cast<ops::NetOp *>(bwd_net->ops_[1].get());
ASSERT_EQ(3UL, first_fc_grad->ops_.size());
ASSERT_EQ(
f::OperatorBase::EMPTY_VAR_NAME(),
first_fc_grad->ops_[2]->Output("A" + f::OperatorBase::GRAD_VAR_SUFFIX()));
ASSERT_EQ(f::kEmptyVarName,
first_fc_grad->ops_[2]->Output("A" + f::kGradVarSuffix));
}
TEST(Backward, net_shared_weight) {
......@@ -311,17 +307,15 @@ TEST(Backward, op_part_of_output_are_not_need) {
ASSERT_EQ(1UL, fill_zero.inputs_.size());
ASSERT_EQ("Z", fill_zero.inputs_[0]);
ASSERT_EQ(1UL, fill_zero.outputs_.size());
ASSERT_EQ("Z" + f::OperatorBase::ZERO_VAR_SUFFIX(), fill_zero.outputs_[0]);
ASSERT_EQ("Z" + f::kZeroVarSuffix, fill_zero.outputs_[0]);
auto &d_many_out = *net->ops_[1];
ASSERT_EQ("many_output_op_grad", d_many_out.type_);
ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.inputs_.size()); // I/O/OG
ASSERT_EQ("Z" + f::OperatorBase::ZERO_VAR_SUFFIX(),
d_many_out.Input("z" + f::OperatorBase::GRAD_VAR_SUFFIX()));
ASSERT_EQ("Y" + f::OperatorBase::GRAD_VAR_SUFFIX(),
d_many_out.Input("y" + f::OperatorBase::GRAD_VAR_SUFFIX()));
ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(),
d_many_out.Output("x" + f::OperatorBase::GRAD_VAR_SUFFIX()));
ASSERT_EQ("Z" + f::kZeroVarSuffix, d_many_out.Input("z" + f::kGradVarSuffix));
ASSERT_EQ("Y" + f::kGradVarSuffix, d_many_out.Input("y" + f::kGradVarSuffix));
ASSERT_EQ("X" + f::kGradVarSuffix,
d_many_out.Output("x" + f::kGradVarSuffix));
}
TEST(Backward, op_part_of_input_are_not_need) {
......@@ -331,12 +325,10 @@ TEST(Backward, op_part_of_input_are_not_need) {
ASSERT_EQ(grad_mul.type_, "mul_grad");
ASSERT_EQ(grad_mul.inputs_.size(), 2UL + 1UL + 1UL);
ASSERT_EQ(grad_mul.outputs_.size(), 2UL);
ASSERT_EQ(grad_mul.Output("A" + f::OperatorBase::GRAD_VAR_SUFFIX()),
f::OperatorBase::EMPTY_VAR_NAME());
ASSERT_EQ(grad_mul.Output("B" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"b" + f::OperatorBase::GRAD_VAR_SUFFIX());
ASSERT_EQ(grad_mul.Input("Out" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"out" + f::OperatorBase::GRAD_VAR_SUFFIX());
ASSERT_EQ(grad_mul.Output("A" + f::kGradVarSuffix), f::kEmptyVarName);
ASSERT_EQ(grad_mul.Output("B" + f::kGradVarSuffix), "b" + f::kGradVarSuffix);
ASSERT_EQ(grad_mul.Input("Out" + f::kGradVarSuffix),
"out" + f::kGradVarSuffix);
ASSERT_EQ(grad_mul.Input("A"), "a");
ASSERT_EQ(grad_mul.Input("B"), "b");
ASSERT_EQ(grad_mul.Input("Out"), "out");
......@@ -368,23 +360,4 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
EXPECT_EQ(bwd_net->ops_[1]->outputs_.size(), 0UL);
EXPECT_EQ(bwd_net->ops_[2]->inputs_.size(), 0UL);
EXPECT_EQ(bwd_net->ops_[2]->outputs_.size(), 0UL);
/*
EXPECT_EQ(grad_fc.Output("X" + f::OperatorBase::GRAD_VAR_SUFFIX()),
f::OperatorBase::EMPTY_VAR_NAME());
EXPECT_EQ(grad_fc.Output("W" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"w3" + f::OperatorBase::GRAD_VAR_SUFFIX());
EXPECT_EQ(grad_fc.Output("b" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"b3" + f::OperatorBase::GRAD_VAR_SUFFIX());
EXPECT_EQ(grad_fc.Output("mul_result" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"mul_out3" + f::OperatorBase::GRAD_VAR_SUFFIX());
EXPECT_EQ(grad_fc.Input("Out" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"out3" + f::OperatorBase::GRAD_VAR_SUFFIX());
EXPECT_EQ(grad_fc.Input("X"), "out2");
EXPECT_EQ(grad_fc.Input("W"), "w3");
EXPECT_EQ(grad_fc.Input("mul_result"), "mul_out3");
EXPECT_EQ(grad_fc.Input("add_result"), "tmp_out3");
EXPECT_EQ(grad_fc.Input("Out"), "out3");
*/
}
......@@ -56,8 +56,7 @@ static void TransOpArg(const OperatorBase* src_op, OperatorBase* dst_op,
for (const auto& arg : src_arg_list) {
std::string src_name = arg.name();
std::string dst_name =
is_grad ? src_name + OperatorBase::GRAD_VAR_SUFFIX() : src_name;
std::string dst_name = is_grad ? src_name + kGradVarSuffix : src_name;
(*dst_op->in_out_idxs_)[dst_name] = idx++;
int src_arg_idx = src_op->in_out_idxs_->at(src_name);
int src_begin =
......@@ -65,10 +64,9 @@ static void TransOpArg(const OperatorBase* src_op, OperatorBase* dst_op,
int src_end = src_format == nullptr ? src_arg_idx + 1
: src_format->at(src_arg_idx + 1);
for (int i = src_begin; i < src_end; ++i) {
std::string s = is_grad ? src_inout[i] + OperatorBase::GRAD_VAR_SUFFIX()
: arg.ignore_gradient()
? OperatorBase::EMPTY_VAR_NAME()
: src_inout[i];
std::string s =
is_grad ? src_inout[i] + kGradVarSuffix
: (arg.ignore_gradient() ? kEmptyVarName : src_inout[i]);
dst_inout.emplace_back(s);
}
if (dst_format != nullptr) {
......
......@@ -83,24 +83,21 @@ TEST(GradOpBuilder, MutiInOut) {
EXPECT_EQ(grad_test_op->Input("Out1"), "out1");
EXPECT_EQ(grad_test_op->Inputs("Out2_mult"),
std::vector<std::string>({"out2_1", "out2_2"}));
EXPECT_EQ(grad_test_op->Input("Out1" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"out1" + f::OperatorBase::GRAD_VAR_SUFFIX());
EXPECT_EQ(
grad_test_op->Inputs("Out2_mult" + f::OperatorBase::GRAD_VAR_SUFFIX()),
EXPECT_EQ(grad_test_op->Input("Out1" + f::kGradVarSuffix),
"out1" + f::kGradVarSuffix);
EXPECT_EQ(grad_test_op->Inputs("Out2_mult" + f::kGradVarSuffix),
std::vector<std::string>(
{"out2_1" + f::OperatorBase::GRAD_VAR_SUFFIX(),
"out2_2" + f::OperatorBase::GRAD_VAR_SUFFIX()}));
{"out2_1" + f::kGradVarSuffix, "out2_2" + f::kGradVarSuffix}));
ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
EXPECT_EQ(grad_test_op->Output("In1" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"in1" + f::OperatorBase::GRAD_VAR_SUFFIX());
EXPECT_EQ(
grad_test_op->Outputs("In2_mult" + f::OperatorBase::GRAD_VAR_SUFFIX()),
std::vector<std::string>({"in2_1" + f::OperatorBase::GRAD_VAR_SUFFIX(),
"in2_2" + f::OperatorBase::GRAD_VAR_SUFFIX(),
"in2_3" + f::OperatorBase::GRAD_VAR_SUFFIX()}));
EXPECT_EQ(grad_test_op->Output("In3" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"in3" + f::OperatorBase::GRAD_VAR_SUFFIX());
EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix),
"in1" + f::kGradVarSuffix);
EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix),
std::vector<std::string>({"in2_1" + f::kGradVarSuffix,
"in2_2" + f::kGradVarSuffix,
"in2_3" + f::kGradVarSuffix}));
EXPECT_EQ(grad_test_op->Output("In3" + f::kGradVarSuffix),
"in3" + f::kGradVarSuffix);
}
TEST(GradOpBuilder, IOIgnoredInGradient) {
......@@ -116,30 +113,25 @@ TEST(GradOpBuilder, IOIgnoredInGradient) {
ASSERT_EQ(grad_test_op->inputs_.size(), 5UL + 3UL + 3UL);
EXPECT_EQ(grad_test_op->Input("In1"), "in1");
EXPECT_EQ(grad_test_op->Inputs("In2_mult"),
std::vector<std::string>({f::OperatorBase::EMPTY_VAR_NAME(),
f::OperatorBase::EMPTY_VAR_NAME()}));
std::vector<std::string>({f::kEmptyVarName, f::kEmptyVarName}));
EXPECT_EQ(grad_test_op->Inputs("In3_mult"),
std::vector<std::string>({"in3_1", "in3_2"}));
EXPECT_EQ(grad_test_op->Inputs("Out1_mult"),
std::vector<std::string>({"out1_1", "out1_2"}));
EXPECT_EQ(grad_test_op->Input("Out2"), f::OperatorBase::EMPTY_VAR_NAME());
EXPECT_EQ(
grad_test_op->Inputs("Out1_mult" + f::OperatorBase::GRAD_VAR_SUFFIX()),
EXPECT_EQ(grad_test_op->Input("Out2"), f::kEmptyVarName);
EXPECT_EQ(grad_test_op->Inputs("Out1_mult" + f::kGradVarSuffix),
std::vector<std::string>(
{"out1_1" + f::OperatorBase::GRAD_VAR_SUFFIX(),
"out1_2" + f::OperatorBase::GRAD_VAR_SUFFIX()}));
EXPECT_EQ(grad_test_op->Input("Out2" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"out2" + f::OperatorBase::GRAD_VAR_SUFFIX());
{"out1_1" + f::kGradVarSuffix, "out1_2" + f::kGradVarSuffix}));
EXPECT_EQ(grad_test_op->Input("Out2" + f::kGradVarSuffix),
"out2" + f::kGradVarSuffix);
ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
EXPECT_EQ(grad_test_op->Output("In1" + f::OperatorBase::GRAD_VAR_SUFFIX()),
"in1" + f::OperatorBase::GRAD_VAR_SUFFIX());
EXPECT_EQ(
grad_test_op->Outputs("In2_mult" + f::OperatorBase::GRAD_VAR_SUFFIX()),
std::vector<std::string>({"in2_1" + f::OperatorBase::GRAD_VAR_SUFFIX(),
"in2_2" + f::OperatorBase::GRAD_VAR_SUFFIX()}));
EXPECT_EQ(
grad_test_op->Outputs("In3_mult" + f::OperatorBase::GRAD_VAR_SUFFIX()),
std::vector<std::string>({"in3_1" + f::OperatorBase::GRAD_VAR_SUFFIX(),
"in3_2" + f::OperatorBase::GRAD_VAR_SUFFIX()}));
EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix),
"in1" + f::kGradVarSuffix);
EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix),
std::vector<std::string>(
{"in2_1" + f::kGradVarSuffix, "in2_2" + f::kGradVarSuffix}));
EXPECT_EQ(grad_test_op->Outputs("In3_mult" + f::kGradVarSuffix),
std::vector<std::string>(
{"in3_1" + f::kGradVarSuffix, "in3_2" + f::kGradVarSuffix}));
}
......@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax="proto2";
syntax = "proto2";
package paddle.framework;
import "attr_type.proto";
import "attribute.proto";
// AttrDesc is used to describe Attributes of an Operator. It contains the
// name, type, and value of an Attribute.
......
......@@ -15,13 +15,14 @@ limitations under the License. */
// Protocol Message for 3rd-party language binding.
//
// Paddle Python package will use `OpProto` to generate op creation methods.
// The op creation methods take user's input and generate `OpDesc` proto message,
// The op creation methods take user's input and generate `OpDesc` proto
// message,
// then pass `OpDesc` to C++ side and create Op pointer.
//
syntax="proto2";
syntax = "proto2";
package paddle.framework;
import "attr_type.proto";
import "attribute.proto";
// Attribute protocol message for 3rd-party language binding.
// It will store the Op support what attribute and what type.
......@@ -32,13 +33,14 @@ message AttrProto {
// Supported attribute type.
required AttrType type = 2;
// Supported attribute comments. It helps 3rd-party language generate doc-string.
// Supported attribute comments. It helps 3rd-party language generate
// doc-string.
required string comment = 3;
// If that attribute is generated, it means the Paddle third language
// binding has responsibility to fill that attribute. End-User should
// not set that attribute.
optional bool generated = 4 [default=false];
optional bool generated = 4 [ default = false ];
}
// Input or output message for 3rd-party language binding.
......@@ -48,7 +50,8 @@ message VarProto {
// e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
required string name = 1;
// The comment for that input. It helps 3rd-party language generate doc-string.
// The comment for that input. It helps 3rd-party language generate
// doc-string.
required string comment = 2;
// Is that input/output could be a list or not.
......@@ -70,7 +73,7 @@ message VarProto {
// }
// }
//
optional bool multiple = 3 [default=false];
optional bool multiple = 3 [ default = false ];
// It marks that output is a temporary output. That output is not used by
// user, but used by other op internally as input. If other op is not use
......@@ -83,7 +86,7 @@ message VarProto {
// attrs = {
// "temporary_index": [1]
// }
optional bool temporary = 4 [default=false];
optional bool temporary = 4 [ default = false ];
// The gradient of operator can be ignored immediately
// e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2
......@@ -110,5 +113,4 @@ message OpProto {
// The type of that Op.
required string type = 5;
}
......@@ -14,37 +14,8 @@ limitations under the License. */
#include <paddle/framework/op_registry.h>
namespace paddle {
namespace framework {
template <>
void AttrTypeHelper::SetAttrType<int>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::INT);
}
template <>
void AttrTypeHelper::SetAttrType<float>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::FLOAT);
}
template <>
void AttrTypeHelper::SetAttrType<std::string>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::STRING);
}
#include <vector>
template <>
void AttrTypeHelper::SetAttrType<std::vector<int>>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::INTS);
}
template <>
void AttrTypeHelper::SetAttrType<std::vector<float>>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::FLOATS);
}
template <>
void AttrTypeHelper::SetAttrType<std::vector<std::string>>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::STRINGS);
}
} // namespace framework
namespace paddle {
namespace framework {} // namespace framework
} // namespace paddle
......@@ -19,7 +19,7 @@ limitations under the License. */
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include "paddle/framework/attr_checker.h"
#include "paddle/framework/attribute.h"
#include "paddle/framework/grad_op_builder.h"
#include "paddle/framework/op_desc.pb.h"
#include "paddle/framework/scope.h"
......@@ -27,49 +27,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
// helper class to set attribute type
struct AttrTypeHelper {
template <typename T>
static void SetAttrType(AttrProto* attr);
static Attribute GetAttrValue(const AttrDesc& attr_desc) {
switch (attr_desc.type()) {
case paddle::framework::AttrType::INT: {
return attr_desc.i();
}
case paddle::framework::AttrType::FLOAT: {
return attr_desc.f();
}
case paddle::framework::AttrType::STRING: {
return attr_desc.s();
}
case paddle::framework::AttrType::INTS: {
std::vector<int> val(attr_desc.ints_size());
for (int i = 0; i < attr_desc.ints_size(); ++i) {
val[i] = attr_desc.ints(i);
}
return val;
}
case paddle::framework::AttrType::FLOATS: {
std::vector<float> val(attr_desc.floats_size());
for (int i = 0; i < attr_desc.floats_size(); ++i) {
val[i] = attr_desc.floats(i);
}
return val;
}
case paddle::framework::AttrType::STRINGS: {
std::vector<std::string> val(attr_desc.strings_size());
for (int i = 0; i < attr_desc.strings_size(); ++i) {
val[i] = attr_desc.strings(i);
}
return val;
}
}
PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");
return boost::blank();
}
};
// this class not only make proto but also init attribute checkers.
class OpProtoAndCheckerMaker {
public:
......@@ -136,7 +93,7 @@ class OpProtoAndCheckerMaker {
*attr->mutable_name() = name;
*attr->mutable_comment() = comment;
attr->set_generated(generated);
AttrTypeHelper::SetAttrType<T>(attr);
attr->set_type(AttrTypeID<T>());
return op_checker_->AddAttrChecker<T>(name);
}
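
For context, a hypothetical maker subclass sketched only to show how AddAttr (and hence the new AttrTypeID-based typing) is used; the op name, the attribute, and the AddInput/AddOutput/AddComment/SetDefault helpers are assumed from the surrounding framework rather than taken from this diff:

  class ScaleOpMaker : public OpProtoAndCheckerMaker {
   public:
    ScaleOpMaker(OpProto* proto, OpAttrChecker* op_checker)
        : OpProtoAndCheckerMaker(proto, op_checker) {
      AddInput("X", "Input tensor of scale op.");
      AddOutput("Out", "Output tensor of scale op.");
      AddAttr<float>("scale", "Scaling factor applied to X.")
          .SetDefault(1.0f);
      AddComment("Out = scale * X (illustrative example only).");
    }
  };
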
......@@ -297,7 +254,7 @@ class OpRegistry {
AttributeMap attrs;
for (auto& attr : op_desc.attrs()) {
attrs[attr.name()] = AttrTypeHelper::GetAttrValue(attr);
attrs[attr.name()] = GetAttrValue(attr);
}
return CreateOp(op_desc.type(), inputs, outputs, attrs);
......@@ -341,7 +298,7 @@ class OpRegistry {
static void GenerateTempVariableName(OperatorBase* op) {
static std::atomic<size_t> gUniqId(0UL);
for (auto& outname : op->outputs_) {
if (outname == OperatorBase::TMP_VAR_NAME()) {
if (outname == kTempVarName) {
outname += op->type_;
outname += "@";
outname += std::to_string(gUniqId.fetch_add(1));
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#include <unordered_map>
#include <vector>
#include "paddle/framework/attr_checker.h"
#include "paddle/framework/attribute.h"
#include "paddle/framework/op_desc.pb.h"
#include "paddle/framework/op_proto.pb.h"
#include "paddle/framework/scope.h"
......@@ -32,9 +32,29 @@ limitations under the License. */
namespace paddle {
namespace framework {
/// If a variable is an empty variable, that name will be used.
const std::string kEmptyVarName = "@EMPTY@";
/// If a variable is a temporary variable, that name will be set in Python,
/// but it will be converted to a unique name in scope after OpCreator.
const std::string kTempVarName = "@TEMP@";
/// If a variable's name has a certain suffix, it means that the
/// variable is the gradient of another variable.
/// e.g. Variable "x@GRAD" is the gradient of variable "x".
const std::string kGradVarSuffix = "@GRAD";
/// Variables with this suffix are supposed to be filled up with zeros.
const std::string kZeroVarSuffix = "@ZERO";
inline std::string GradVarName(const std::string& var_name) {
return var_name + kGradVarSuffix;
}
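
For illustration (not part of the diff), the constants and helper above compose names as follows:

  // GradVarName("x")      -> "x@GRAD"   (gradient of variable "x")
  // "b" + kZeroVarSuffix  -> "b@ZERO"   (a gradient to be filled with zeros)
  // kEmptyVarName         == "@EMPTY@"  (placeholder for an unused slot)
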
class OperatorBase;
class InferShapeContext;
class ExecutionContext;
/**
* OperatorBase has the basic element that Net will call to do computation.
* Only CreateOperator from OpRegistry will new Operator directly. User
......@@ -43,25 +63,6 @@ class ExecutionContext;
*/
class OperatorBase {
public:
/// If a variable is a empty variable, that name will be used.
static std::string EMPTY_VAR_NAME() { return "@EMPTY@"; }
/// If a variable is a temporary variable, that name will be set in Python,
/// but it will be convert to a unique name in scope after OpCreator.
static std::string TMP_VAR_NAME() { return "@TEMP@"; }
/// If a variable's name has a certain suffix, it means that the
/// variable is the gradient of another varibale.
/// e.g. Variable "x@GRAD" is the gradient of varibale "x".
static std::string GRAD_VAR_SUFFIX() { return "@GRAD"; }
static std::string GRAD_VAR_NAME(const std::string& name) {
return name + GRAD_VAR_SUFFIX();
}
/// Variables with this suffix are supposed to be filled up with zeros.
static std::string ZERO_VAR_SUFFIX() { return "@ZERO"; }
virtual ~OperatorBase() {}
template <typename T>
......
......@@ -157,22 +157,22 @@ class CPUKernalMultiInputsTest : public OpKernel {
ASSERT_EQ(xs[2], "x2");
auto inVar0 = ctx.MultiInputVar("xs");
ASSERT_EQ(inVar0.size(), 3);
ASSERT_EQ(inVar0.size(), 3U);
auto intVar1 = ctx.InputVar("k");
ASSERT_NE(intVar1, nullptr);
auto outVar0 = ctx.MultiOutputVar("ys");
ASSERT_EQ(outVar0.size(), 2);
ASSERT_EQ(outVar0.size(), 2U);
auto inTensor0 = ctx.MultiInput<Tensor>("xs");
ASSERT_EQ(inTensor0.size(), 3);
ASSERT_EQ(inTensor0.size(), 3U);
auto intTensor1 = ctx.Input<Tensor>("k");
ASSERT_NE(intTensor1, nullptr);
auto outTensor0 = ctx.MultiOutput<Tensor>("ys");
ASSERT_EQ(outTensor0.size(), 2);
ASSERT_EQ(outTensor0.size(), 2U);
auto k = ctx.op_.Input("k");
ASSERT_EQ(k, "k0");
......
......@@ -163,8 +163,8 @@ All parameter, weight, gradient are variables in Paddle.
m.def_submodule(
"var_names",
"The module will return special predefined variable name in Paddle")
.def("empty", OperatorBase::EMPTY_VAR_NAME)
.def("temp", OperatorBase::TMP_VAR_NAME);
.def("empty", []() { return kEmptyVarName; })
.def("temp", []() { return kTempVarName; });
// clang-format off
py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
.def_static("create",
......