Cuda¶
Dynamic Link Libs¶
hl_dso_loader.h¶
Functions
-
void
GetCublasDsoHandle
(void **dso_handle)¶ load the DSO of CUBLAS
- Parameters
**dso_handle
: dso handler
-
void
GetCudnnDsoHandle
(void **dso_handle)¶ load the DSO of CUDNN
- Parameters
**dso_handle
: dso handler
-
void
GetCudartDsoHandle
(void **dso_handle)¶ load the DSO of CUDA Run Time
- Parameters
**dso_handle
: dso handler
-
void
GetCurandDsoHandle
(void **dso_handle)¶ load the DSO of CURAND
- Parameters
**dso_handle
: dso handler
GPU Resources¶
hl_cuda.ph¶
Defines
-
HL_CUDA_PH_
¶
Typedefs
-
typedef struct _global_device_resources *
global_device_resources
¶
-
typedef struct _thread_device_resources *
thread_device_resources
¶
-
typedef struct _hl_device_prop *
hl_device_prop
¶
Functions
-
void
hl_create_thread_resources
(int device, thread_device_resources device_res)¶ thread device resource allocation.
create cuda stream and cuda event, allocate gpu memory and host page-lock memory for threads.
- Parameters
device
: device number.device_res
: device properties.
-
void
hl_create_global_resources
(hl_device_prop device_prop)¶ global device resource allocation.
create cuda stream, initialize cublas, curand and cudnn.
- Parameters
device_prop
: device properties.
-
struct
_global_device_resources
¶ global device resources.
- Parameters
*stream
: device global stream.handle
: device cublas handle.gen
: device curand generator.cudnn_handle
: cudnn handle.*gen_mutex
: gen lock.
-
struct
_thread_device_resources
¶
-
struct
_hl_device_prop
¶
hl_cuda.h¶
Typedefs
-
typedef struct _hl_event_st *
hl_event_t
¶ HPPL event.
Functions
-
int
hl_get_cuda_lib_version
()¶ return cuda runtime api version.
-
void
hl_start
()¶ HPPL start(Initialize all GPU).
-
void
hl_specify_devices_start
(int *device, int number)¶ HPPL start(Initialize the specific GPU).
- Parameters
device
: device ids (0, 1, ...). If device is NULL, all GPUs will be started.number
: number of devices.
-
bool
hl_device_can_access_peer
(int device, int peerDevice)¶ Queries if a device may directly access a peer device’s memory.
- Return
- Returns true if device is capable of directly accessing memory from peerDevice and false otherwise.
- Parameters
device
: Device from which allocations on peerDevice are to be directly accessed.peerDevice
: Device on which the allocations to be directly accessed by device reside.
-
void
hl_device_enable_peer_access
(int peerDevice)¶ Enables direct access to memory allocations on a peer device.
- Parameters
peerDevice
: Peer device to enable direct access to from the current device
-
void
hl_init
(int device)¶ Init a work thread.
- Parameters
device
: device id.
-
void
hl_fini
()¶ Finish a work thread.
-
void
hl_set_sync_flag
(bool flag)¶ Set synchronous/asynchronous flag.
- Note
- This setting is only valid for the current worker thread.
- Parameters
flag
: true(default), set synchronous flag. false, set asynchronous flag.
-
bool
hl_get_sync_flag
()¶ Get synchronous/asynchronous flag.
- Return
- Synchronous call true. Asynchronous call false.
-
int
hl_get_device_count
()¶ Returns the number of compute-capable devices.
-
void
hl_set_device
(int device)¶ Set device to be used.
- Parameters
device
: device id.
-
int
hl_get_device
()¶ Returns which device is currently being used.
- Return
- device device id.
-
void *
hl_malloc_device
(size_t size)¶ Allocate device memory.
- Return
- dest_d pointer to device memory.
- Parameters
size
: size in bytes to allocate.
-
void
hl_free_mem_device
(void *dest_d)¶ Free device memory.
- Parameters
dest_d
: pointer to device memory.
-
void *
hl_malloc_host
(size_t size)¶ Allocate host page-lock memory.
- Return
- dest_h pointer to host memory.
- Parameters
size
: size in bytes to allocate.
-
void
hl_free_mem_host
(void *dest_h)¶ Free host page-lock memory.
- Parameters
dest_h
: pointer to host memory.
-
void
hl_memcpy
(void *dst, void *src, size_t size)¶ Copy data.
- Parameters
dst
: dst memory address(host or device).src
: src memory address(host or device).size
: size in bytes to copy.
-
void
hl_memset_device
(void *dest_d, int value, size_t size)¶ Set device memory to a value.
- Parameters
dest_d
: pointer to device memory.value
: value to set for each byte of specified memory.size
: size in bytes to set.
-
void
hl_memcpy_host2device
(void *dest_d, void *src_h, size_t size)¶ Copy host memory to device memory.
- Parameters
dest_d
: dst memory address.src_h
: src memory address.size
: size in bytes to copy.
-
void
hl_memcpy_device2host
(void *dest_h, void *src_d, size_t size)¶ Copy device memory to host memory.
- Parameters
dest_h
: dst memory address.src_d
: src memory address.size
: size in bytes to copy.
-
void
hl_memcpy_device2device
(void *dest_d, void *src_d, size_t size)¶ Copy device memory to device memory.
- Parameters
dest_d
: dst memory address.src_d
: src memory address.size
: size in bytes to copy.
-
void
hl_rand
(real *dest_d, size_t num)¶ Generate uniformly distributed floats (0, 1.0].
- Parameters
dest_d
: pointer to device memory to store results.num
: number of floats to generate.
-
void
hl_srand
(unsigned int seed)¶ Set the seed value of the random number generator.
- Parameters
seed
: seed value.
-
void
hl_memcpy_async
(void *dst, void *src, size_t size, hl_stream_t stream)¶ Copy data.
- Parameters
dst
: dst memory address(host or device).src
: src memory address(host or device).size
: size in bytes to copy.stream
: stream id.
-
void
hl_stream_synchronize
(hl_stream_t stream)¶ Waits for stream tasks to complete.
- Parameters
stream
: stream id.
-
void
hl_create_event
(hl_event_t *event)¶ Creates an event object.
- Parameters
event
: New event.
-
void
hl_destroy_event
(hl_event_t event)¶ Destroys an event object.
- Parameters
event
: Event to destroy.
-
float
hl_event_elapsed_time
(hl_event_t start, hl_event_t end)¶ Computes the elapsed time between events.
- Return
- time Time between start and end in ms.
- Parameters
start
: Starting event.end
: Ending event.
-
void
hl_stream_record_event
(hl_stream_t stream, hl_event_t event)¶ Records an event.
- Parameters
stream
: Stream in which to insert event.event
: Event waiting to be recorded as completed.
-
void
hl_stream_wait_event
(hl_stream_t stream, hl_event_t event)¶ Make a compute stream wait on an event.
- Parameters
stream
: Stream in which to insert event.event
: Event to wait on.
-
void
hl_event_synchronize
(hl_event_t event)¶ Wait for an event to complete.
- Parameters
event
: event to wait for.
-
void
hl_set_device_flags_block
()¶ Sets block flags to be used for device executions.
- Note
- This interface needs to be called before hl_start.
-
const char *
hl_get_device_error_string
()¶ Returns the last error string from a cuda runtime call.
-
const char *
hl_get_device_error_string
(size_t err)¶ Returns the error string for the given cuda runtime error number.
- See
- hl_get_device_last_error()
- Parameters
err
: error number.
-
int
hl_get_device_last_error
()¶ Returns the last error number.
- Return
- error number.
- See
- hl_get_device_error_string()
-
bool
hl_cuda_event_is_ready
(hl_event_t event)¶ check cuda event is ready
- Return
- true cuda event is ready. false cuda event is not ready.
- Parameters
event
: cuda event to query.
-
void
hl_device_synchronize
()¶ hppl device synchronization.
CUDA Wrapper¶
hl_cuda_cublas.h¶
Functions
-
void
hl_matrix_transpose
(real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc)¶ Matrix transpose: C_d = T(A_d)
- Parameters
A_d
: input matrix (dimM x dimN).C_d
: output matrix (dimN x dimM).dimM
: matrix height.dimN
: matrix width.lda
: the first dimension of A_d.ldc
: the first dimension of C_d.
-
void
hl_matrix_transpose
(real *A_d, real *C_d, int dimM, int dimN)¶
-
void
hl_matrix_inverse
(real *A_d, real *C_d, int dimN, int lda, int ldc)¶
-
void
hl_matrix_mul
(real *A_d, hl_trans_op_t transa, real *B_d, hl_trans_op_t transb, real *C_d, int dimM, int dimN, int dimK, real alpha, real beta, int lda, int ldb, int ldc)¶ C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d.
- Parameters
A_d
: input.transa
: operation op(A) that is non-transpose or transpose.B_d
: input.transb
: operation op(B) that is non-transpose or transpose.C_d
: output.dimM
: matrix height of op(A) & C.dimN
: matrix width of op(B) & C.dimK
: width of op(A) & height of op(B).alpha
: scalar used for multiplication.beta
: scalar used for multiplication.lda
: the first dimension of A_d.ldb
: the first dimension of B_d.ldc
: the first dimension of C_d.
-
void
hl_matrix_mul
(real *A_d, hl_trans_op_t transa, real *B_d, hl_trans_op_t transb, real *C_d, int dimM, int dimN, int dimK, real alpha, real beta)¶ C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d.
- Parameters
A_d
: input.transa
: operation op(A) that is non-transpose or transpose.B_d
: input.transb
: operation op(B) that is non-transpose or transpose.C_d
: output.dimM
: matrix height of op(A) & C.dimN
: matrix width of op(B) & C.dimK
: width of op(A) & height of op(B).alpha
: scalar used for multiplication.beta
: scalar used for multiplication.
-
void
hl_matrix_mul_vector
(real *A_d, hl_trans_op_t trans, real *B_d, real *C_d, int dimM, int dimN, real alpha, real beta, int lda, int incb, int incc)¶ This function performs the matrix-vector multiplication. C_d = alpha*op(A_d)*B_d + beta*C_d.
- Parameters
A_d
: matrix.trans
: operation op(A) that is non-transpose or transpose.B_d
: vector with dimN(dimM) elements if trans==HPPL_OP_N(HPPL_OP_T).C_d
: vector with dimM(dimN) elements if trans==HPPL_OP_N(HPPL_OP_T).dimM
: number of rows of matrix A_d.dimN
: number of columns of matrix A_d.alpha
: scalar used for multiplication.beta
: scalar used for multiplication.lda
: the first dimension of A_d.incb
: increase B_d size for compaction.incc
: increase C_d size for compaction.
-
void
hl_matrix_mul_vector
(real *A_d, hl_trans_op_t trans, real *B_d, real *C_d, int dimM, int dimN, real alpha, real beta)¶ This function performs the matrix-vector multiplication. C_d = alpha*op(A_d)*B_d + beta*C_d.
- Parameters
A_d
: matrix.trans
: operation op(A) that is non-transpose or transpose.B_d
: vector with dimN(dimM) elements if trans==HPPL_OP_N(HPPL_OP_T).C_d
: vector with dimM(dimN) elements if trans==HPPL_OP_N(HPPL_OP_T).dimM
: number of rows of matrix A_d.dimN
: number of columns of matrix A_d.alpha
: scalar used for multiplication.beta
: scalar used for multiplication.
hl_cuda_cudnn.h¶
Typedefs
-
typedef struct _hl_tensor_descriptor *
hl_tensor_descriptor
¶ hppl image descriptor.
-
typedef struct _hl_pooling_descriptor *
hl_pooling_descriptor
¶ hppl pooling descriptor.
-
typedef struct _hl_filter_descriptor *
hl_filter_descriptor
¶ hppl filter descriptor.
-
typedef struct _hl_convolution_descriptor *
hl_convolution_descriptor
¶ hppl convolution descriptor.
Enums
Functions
-
int
hl_get_cudnn_lib_version
()¶ return cudnn lib version
-
void
hl_create_tensor_descriptor
(hl_tensor_descriptor *image_desc)¶ create image descriptor.
- Parameters
image_desc
: image descriptor.
-
void
hl_tensor_reshape
(hl_tensor_descriptor image_desc, int batch_size, int feature_maps, int height, int width)¶ reshape image descriptor.
- Parameters
image_desc
: image descriptor.batch_size
: input batch size.feature_maps
: image feature maps.height
: image height.width
: image width.
-
void
hl_tensor_reshape
(hl_tensor_descriptor image_desc, int batch_size, int feature_maps, int height, int width, int nStride, int cStride, int hStride, int wStride)¶ reshape image descriptor.
- Parameters
image_desc
: image descriptor.batch_size
: input batch size.feature_maps
: image feature maps.height
: image height.width
: image width.nStride
: stride between two consecutive images.cStride
: stride between two consecutive feature maps.hStride
: stride between two consecutive rows.wStride
: stride between two consecutive columns.
-
void
hl_destroy_tensor_descriptor
(hl_tensor_descriptor image_desc)¶ destroy image descriptor.
- Parameters
image_desc
: hppl image descriptor.
-
void
hl_create_pooling_descriptor
(hl_pooling_descriptor *pooling_desc, hl_pooling_mode_t mode, int height, int width, int height_padding, int width_padding, int stride_height, int stride_width)¶ create pooling descriptor.
- Parameters
pooling_desc
: pooling descriptor.mode
: pooling mode.height
: height of the pooling window.width
: width of the pooling window.height_padding
: padding height.width_padding
: padding width.stride_height
: pooling vertical stride.stride_width
: pooling horizontal stride.
-
void
hl_destroy_pooling_descriptor
(hl_pooling_descriptor pooling_desc)¶ destroy pooling descriptor.
- Parameters
pooling_desc
: hppl pooling descriptor.
-
void
hl_pooling_forward
(hl_tensor_descriptor input, real *input_image, hl_tensor_descriptor output, real *output_image, hl_pooling_descriptor pooling)¶ pooling forward(calculate output image).
- Parameters
input
: input image descriptor.input_image
: input image data.output
: output image descriptor.output_image
: output image data.pooling
: pooling descriptor.
-
void
hl_pooling_backward
(hl_tensor_descriptor input, real *input_image, real *input_image_grad, hl_tensor_descriptor output, real *output_image, real *output_image_grad, hl_pooling_descriptor pooling)¶ pooling backward(calculate input image gradient).
- Parameters
input
: input image descriptor.input_image
: input image data.input_image_grad
: input image gradient data.output
: output image descriptor.output_image
: output image data.output_image_grad
: output image gradient data.pooling
: pooling descriptor.
-
void
hl_create_filter_descriptor
(hl_filter_descriptor *filter, int input_feature_maps, int output_feature_maps, int height, int width)¶ create filter descriptor.
- Parameters
filter
: filter descriptor.input_feature_maps
: input image feature maps.output_feature_maps
: output image feature maps.height
: filter height.width
: filter width.
-
void
hl_conv_workspace
(hl_tensor_descriptor input, hl_tensor_descriptor output, hl_filter_descriptor filter, hl_convolution_descriptor conv, int *convFwdAlgo, size_t *fwdLimitBytes, int *convBwdDataAlgo, size_t *bwdDataLimitBytes, int *convBwdFilterAlgo, size_t *bwdFilterLimitBytes)¶ convolution workspace configuration
- Parameters
input
: image descriptor.output
: image descriptor.filter
: filter descriptor.conv
: convolution descriptor.convFwdAlgo
: forward algorithm.fwdLimitBytes
: forward workspace size.convBwdDataAlgo
: backward data algorithm.bwdDataLimitBytes
: backward data workspace size.convBwdFilterAlgo
: backward filter algorithm.bwdFilterLimitBytes
: backward filter workspace size.
-
void
hl_destroy_filter_descriptor
(hl_filter_descriptor filter)¶ destroy filter descriptor.
- Parameters
filter
: hppl filter descriptor.
-
void
hl_create_convolution_descriptor
(hl_convolution_descriptor *conv, hl_tensor_descriptor image, hl_filter_descriptor filter, int padding_height, int padding_width, int stride_height, int stride_width)¶ create convolution descriptor.
- Parameters
conv
: conv descriptor.image
: input image descriptor.filter
: filter descriptor.padding_height
: padding height.padding_width
: padding width.stride_height
: stride height.stride_width
: stride width.
-
void
hl_reset_convolution_descriptor
(hl_convolution_descriptor conv, hl_tensor_descriptor image, hl_filter_descriptor filter, int padding_height, int padding_width, int stride_height, int stride_width)¶ reset convolution descriptor.
- Parameters
conv
: conv descriptor.image
: input image descriptor.filter
: filter descriptor.padding_height
: padding height.padding_width
: padding width.stride_height
: stride height.stride_width
: stride width.
-
void
hl_destroy_convolution_descriptor
(hl_convolution_descriptor conv)¶ destroy convolution descriptor.
- Parameters
conv
: hppl convolution descriptor.
-
void
hl_convolution_forward
(hl_tensor_descriptor input, real *input_data, hl_tensor_descriptor output, real *output_data, hl_filter_descriptor filter, real *filter_data, hl_convolution_descriptor conv, void *gpuWorkSpace, size_t sizeInBytes, int convFwdAlgo)¶ convolution forward(calculate output image).
- Parameters
input
: input image descriptor.input_data
: input image data.output
: output image descriptor.output_data
: output image data.filter
: filter descriptor.filter_data
: filter data.conv
: convolution descriptor.gpuWorkSpace
: limited gpu workspace.sizeInBytes
: gpu workspace size (bytes).convFwdAlgo
: forward algorithm.
-
void
hl_convolution_forward_add_bias
(hl_tensor_descriptor bias, real *bias_data, hl_tensor_descriptor output, real *output_data)¶ convolution forward add bias(calculate output add bias).
- Parameters
bias
: bias descriptor.bias_data
: bias data.output
: output image descriptor.output_data
: output image data.
-
void
hl_convolution_backward_filter
(hl_tensor_descriptor input, real *input_data, hl_tensor_descriptor output, real *output_grad_data, hl_filter_descriptor filter, real *filter_grad_data, hl_convolution_descriptor conv, void *gpuWorkSpace, size_t sizeInBytes, int convBwdFilterAlgo)¶ convolution backward filter(calculate filter grad data).
- Parameters
input
: input image descriptor.input_data
: input image data.output
: output image descriptor.output_grad_data
: output image grad data.filter
: filter descriptor.filter_grad_data
: filter grad data.conv
: convolution descriptor.gpuWorkSpace
: limited gpu workspace.sizeInBytes
: gpu workspace size (bytes).convBwdFilterAlgo
: backward filter algorithm.
-
void
hl_convolution_backward_data
(hl_tensor_descriptor input, real *input_data_grad, hl_tensor_descriptor output, real *output_grad_data, hl_filter_descriptor filter, real *filter_data, hl_convolution_descriptor conv, void *gpuWorkSpace, size_t sizeInBytes, int convBwdDataAlgo)¶ convolution backward data(calculate input image grad data).
- Parameters
input
: input image descriptor.input_data_grad
: input image grad data.output
: output image descriptor.output_grad_data
: output image grad data.filter
: filter descriptor.filter_data
: filter data.conv
: convolution descriptor.gpuWorkSpace
: limited gpu workspace.sizeInBytes
: gpu workspace size (bytes).convBwdDataAlgo
: backward data algorithm.
-
void
hl_convolution_backward_bias
(hl_tensor_descriptor bias, real *bias_grad_data, hl_tensor_descriptor output, real *output_grad_data)¶ convolution backward bias(calculate bias grad data).
- Parameters
bias
: bias descriptor.bias_grad_data
: bias grad data.output
: output image descriptor.output_grad_data
: output image grad data.
-
void
hl_softmax_forward
(real *input, real *output, int height, int width)¶ softmax forward.
- Parameters
input
: input value.output
: output value.height
: matrix height.width
: matrix width.
-
void
hl_softmax_backward
(real *output_value, real *output_grad, int height, int width)¶ softmax backward.
- Parameters
output_value
: output value data.output_grad
: output grad data.height
: matrix height.width
: matrix width.
-
void
hl_batch_norm_forward_training
(hl_tensor_descriptor inputDesc, real *input, hl_tensor_descriptor outputDesc, real *output, hl_tensor_descriptor bnParamDesc, real *scale, real *bias, double factor, real *runningMean, real *runningInvVar, double epsilon, real *savedMean, real *savedVar)¶ cudnn batch norm forward.
- Parameters
inputDesc
: input tensor descriptor desc.input
: input data.outputDesc
: output tensor descriptor desc.output
: output data.bnParamDesc
: tensor descriptor desc. bnScale, bnBias, running mean/var, save_mean/var.scale
: batch normalization scale parameter (in original paper scale is referred to as gamma).bias
: batch normalization bias parameter (in original paper bias is referred to as beta).factor
: Factor used in the moving average computation. runningMean = newMean * factor + runningMean * (1 - factor)
runningMean
: running mean.runningInvVar
: running variance.epsilon
: Epsilon value used in the batch normalization formula.savedMean
: optional cache to save intermediate results.savedVar
: optional cache to save intermediate results.
-
void
hl_batch_norm_forward_inference
(hl_tensor_descriptor inputDesc, real *input, hl_tensor_descriptor outputDesc, real *output, hl_tensor_descriptor bnParamDesc, real *scale, real *bias, real *estimatedMean, real *estimatedVar, double epsilon)¶ cudnn batch norm forward.
- Parameters
inputDesc
: input tensor descriptor desc.input
: input data.outputDesc
: output tensor descriptor desc.output
: output data.bnParamDesc
: tensor descriptor desc. bnScale, bnBias, running mean/var, save_mean/var.scale
: batch normalization scale parameter (in original paper scale is referred to as gamma).bias
: batch normalization bias parameter (in original paper bias is referred to as beta).estimatedMean
:estimatedVar
: It is suggested that resultRunningMean, resultRunningVariance from the cudnnBatchNormalizationForwardTraining call accumulated during the training phase are passed as inputs here.epsilon
: Epsilon value used in the batch normalization formula.
-
void
hl_batch_norm_backward
(hl_tensor_descriptor inputDesc, real *input, hl_tensor_descriptor outGradDesc, real *outGrad, hl_tensor_descriptor inGradDesc, real *inGrad, hl_tensor_descriptor dBnParamDesc, real *scale, real *scaleGrad, real *biasGrad, double epsilon, real *savedMean, real *savedInvVar)¶ cudnn batch norm backward.
- Parameters
inputDesc
: input tensor descriptor desc.input
: input data.outGradDesc
: output tensor descriptor desc.outGrad
: output data.inGradDesc
: input tensor descriptor desc.inGrad
: input data.dBnParamDesc
: tensor descriptor desc. bnScale, bnBias, running mean/var, save_mean/var.scale
: batch normalization scale parameter (in original paper scale is referred to as gamma).scaleGrad
: batch normalization scale parameter (in original paper scale is referred to as gamma) gradient.biasGrad
: batch normalization bias parameter (in original paper bias is referred to as beta) gradient.epsilon
: Epsilon value used in the batch normalization formula.savedMean
: optional cache to save intermediate results.savedInvVar
: optional cache to save intermediate results.
hl_cuda_cudnn.ph¶
Defines
-
HL_CUDA_CUDNN_PH_
¶
-
GET_TENSOR_DESCRIPTOR
(image)¶
-
GET_FILTER_DESCRIPTOR
(filter)¶
-
GET_CONVOLUTION_DESCRIPTOR
(conv)¶
Typedefs
-
typedef struct _cudnn_tensor_descriptor *
cudnn_tensor_descriptor
¶
-
typedef struct _cudnn_pooling_descriptor *
cudnn_pooling_descriptor
¶
-
typedef struct _cudnn_filter_descriptor *
cudnn_filter_descriptor
¶
-
typedef struct _cudnn_convolution_descriptor *
cudnn_convolution_descriptor
¶
-
struct
_cudnn_tensor_descriptor
¶
-
struct
_cudnn_pooling_descriptor
¶
-
struct
_cudnn_filter_descriptor
¶
-
struct
_cudnn_convolution_descriptor
¶