/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
#include "hl_matrix.h"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_ops.cuh"
#include "hl_sequence.h"
#include "hl_sparse.ph"
#include "paddle/utils/Logging.h"

DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);
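// Element-wise GPU matrix addition: C = alpha * A + beta * B for a dimM x dimN
// matrix. The repeated dimN arguments passed below are presumably the row
// strides of A, B and C, i.e. all three matrices are assumed densely packed.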
void hl_matrix_add(real* A_d,
                   real* B_d,
                   real* C_d,
                   int dimM,
                   int dimN,
                   real alpha,
                   real beta) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
  CHECK_NOTNULL(C_d);

  hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
      ternary::_add<real>(alpha, beta),
      A_d,
      B_d,
      C_d,
      dimM,
      dimN,
      dimN,
      dimN,
      dimN);
  CHECK_SYNC("hl_matrix_add failed");
}

#ifdef PADDLE_TYPE_DOUBLE
#define THRESHOLD 128
#else
#define THRESHOLD 64
#endif
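// THRESHOLD clamps (x - rowMax) from below before exponentiation (see
// subMaxAndExp), keeping exp() away from underflow for the chosen precision.
// findMax: each thread scans a strided slice of the row, then a shared-memory
// tree reduction leaves the row maximum in max[0].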
__device__ __forceinline__ void findMax(real* I,
                                        real* dfMax_s,
                                        int blockSize,
                                        int base,
                                        int curIdx,
                                        int nextIdx,
                                        int dimN,
                                        real* max) {
  dfMax_s[base] = -1.0e20;
  while (curIdx < dimN) {
    if (dfMax_s[base] < I[nextIdx]) {
      dfMax_s[base] = I[nextIdx];
    }
    nextIdx += blockSize;
    curIdx += blockSize;
  }
  __syncthreads();

  for (int stride = blockSize >> 1; stride > 0; stride >>= 1) {
    __syncthreads();
    if (base < stride) {
      nextIdx = base + stride;
      if (dfMax_s[base] < dfMax_s[nextIdx]) {
        dfMax_s[base] = dfMax_s[nextIdx];
      }
    }
  }

  if (0 == base) {
    max[0] = dfMax_s[0];
  }
  __syncthreads();
}

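// Subtracts the row maximum from every element, clamps the result at
// -THRESHOLD, writes the shifted value back into I and stores its exponential
// in O.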
__device__ __forceinline__ void subMaxAndExp(real* I,
                                             real* O,
                                             int curIdx,
                                             int nextIdx,
                                             int blockSize,
                                             int dimN,
                                             real max) {
  real val;
  while (curIdx < dimN) {
    val = I[nextIdx] - max;
    if (val < -THRESHOLD) {
      val = -THRESHOLD;
    }
    I[nextIdx] = val;
#ifndef PADDLE_TYPE_DOUBLE
    O[nextIdx] = __expf(val);
#else
    O[nextIdx] = exp(val);
#endif
    nextIdx += blockSize;
    curIdx += blockSize;
  }
  __syncthreads();
}

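// Block-wide sum of the exponentiated row values; the total is left in
// dfMax_s[0].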
__device__ __forceinline__ void valueSum(real* O,
                                         real* dfMax_s,
                                         int blockSize,
                                         int base,
                                         int curIdx,
                                         int nextIdx,
                                         int dimN) {
  dfMax_s[base] = 0;
  while (curIdx < dimN) {
    dfMax_s[base] += O[nextIdx];
    nextIdx += blockSize;
    curIdx += blockSize;
  }
  __syncthreads();

  for (int stride = blockSize >> 1; stride > 0; stride >>= 1) {
    __syncthreads();
    if (base < stride) {
      nextIdx = base + stride;
      dfMax_s[base] += dfMax_s[nextIdx];
    }
  }
  __syncthreads();
}

__device__ __forceinline__ void divSum(
    real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
  while (curIdx < dimN) {
    O[nextIdx] /= sum;
    nextIdx += blockSize;
    curIdx += blockSize;
  }
}

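// Numerically stable softmax of one row of length dimN, computed cooperatively
// by one thread block: find the row maximum, subtract it and exponentiate, sum
// the results, then divide by that sum. dfMax_s is a blockSize-element shared
// buffer used for both reductions.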
__device__ __forceinline__ void softmax(real* I,
                                        real* O,
                                        real* dfMax_s,
                                        int blockSize,
                                        int base,
                                        int curIdx,
                                        int nextIdx,
                                        int dimN) {
  __shared__ real max;

  // find the max number
  findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);

  // subtract the max value and exponentiate
  subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);

  // add dimN values into blockDim.x buffer
  // sum is in dfMax_s[0]
  valueSum(O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);

  // divide by the sum
  divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
}

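// One thread block per matrix row: block blockIdx.x writes the softmax of row
// blockIdx.x of I into the same row of O. Note that I itself is overwritten
// with the max-shifted values as a side effect.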
template <int blockSize>
__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
  int base = threadIdx.x;
  __shared__ real dfMax_s[blockSize];
  int nextIdx = blockIdx.x * dimN + base;
  int curIdx = base;

  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
}

void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);

  dim3 block(512, 1);
  dim3 grid(dimM, 1);
  KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
  CHECK_SYNC("hl_matrix_softmax failed");
}

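// Sequence variant of the softmax kernel: index holds the start offset of each
// sequence (so index[bid + 1] - index[bid] is its length), and block bid
// normalizes that span of I into O.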
template <int blockSize>
__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
  int base = threadIdx.x;
  int bid = blockIdx.x;
  __shared__ real dfMax_s[blockSize];

  int start = index[bid];
  int dimN = index[bid + 1] - start;

  int nextIdx = start + base;
  int curIdx = base;

  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
}

void hl_sequence_softmax_forward(real* A_d,
                                 real* C_d,
                                 const int* index,
                                 int numSequence) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);

  dim3 block(512, 1);
  dim3 grid(numSequence, 1);
  KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
  CHECK_SYNC("hl_sequence_softmax_forward failed");
}

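// Softmax backward, applied element-wise per row:
//   grad = output .* (grad - sftmaxSum[row]),
// where sftmaxSum_d[row] is presumably the per-row sum of grad .* output
// precomputed by the caller.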
__global__ void KeMatrixDerivative(
    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
  int index;

  if (rowIdx < dimM && colIdx < dimN) {
    index = rowIdx * dimN + colIdx;
    grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
  }
}

void hl_matrix_softmax_derivative(
    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
  CHECK_NOTNULL(grad_d);
  CHECK_NOTNULL(output_d);
  CHECK_NOTNULL(sftmaxSum_d);

  int blocksX = dimM;
  int blocksY = (dimN + 1024 - 1) / 1024;
  dim3 threads(1, 1024);
  dim3 grid(blocksX, blocksY);

  KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_d, output_d, sftmaxSum_d, dimM, dimN);
  CHECK_SYNC("hl_matrix_softmax_derivative failed");
}

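// Multi-label binary cross entropy with a sparse CSR label matrix (row/col are
// the CSR row offsets and column indices of the nonzero, i.e. positive, labels).
// Per row:
//   entropy[i] = -sum_{j not labeled} log(1 - o_ij) - sum_{j labeled} log(o_ij)
// The kernel first charges -log(1 - o) for every column and then corrects the
// labeled columns by -log(o / (1 - o)), which is algebraically equivalent.
// entropy is accumulated into, so the caller presumably zero-initializes it.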
__global__ void KeMatrixMultiBinaryCrossEntropy(
    real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < dimM) {
    for (int i = 0; i < dimN; i++) {
      entropy[index] -= log(1 - output[index * dimN + i]);
    }
    int* row_col = col + row[index];
    int col_num = row[index + 1] - row[index];
    for (int i = 0; i < col_num; i++) {
      real o = output[index * dimN + row_col[i]];
      entropy[index] -= log(o / (1 - o));
    }
  }
}

void hl_matrix_multi_binary_cross_entropy(real* output,
                                          real* entropy,
                                          hl_sparse_matrix_s csr_mat,
                                          int dimM,
                                          int dimN) {
  CHECK_NOTNULL(output);
  CHECK_NOTNULL(entropy);
  CHECK_NOTNULL(csr_mat);
  CHECK_EQ(csr_mat->format, HL_SPARSE_CSR);
  int n_threads = 1024;
  int blocks = (dimM + n_threads - 1) / n_threads;
  dim3 threads(n_threads);
  dim3 grid(blocks);
  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
  KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
}

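// Gradient of the multi-label binary cross entropy above with respect to
// output, accumulated into grad: +1 / (1 - o) for unlabeled columns, and after
// the -1 / (o * (1 - o)) correction on labeled columns the net effect there is
// -1 / o.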
__global__ void KeMatrixMultiBinaryCrossEntropyBp(
    real* output, real* grad, int* row, int* col, int dimM, int dimN) {
  int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (row_idx < dimM) {
    for (int i = 0; i < dimN; i++) {
      int index = row_idx * dimN + i;
      grad[index] += 1.0 / (1 - output[index]);
    }
    int col_num = row[row_idx + 1] - row[row_idx];
    int* row_col = col + row[row_idx];
    for (int i = 0; i < col_num; i++) {
      int index = row_idx * dimN + row_col[i];
      grad[index] -= 1.0 / (output[index] * (1 - output[index]));
    }
  }
}

void hl_matrix_multi_binary_cross_entropy_bp(
    real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
  CHECK_NOTNULL(output);
  CHECK_NOTNULL(grad);
  CHECK_NOTNULL(csr_mat);
  CHECK_EQ(csr_mat->format, HL_SPARSE_CSR);
  int n_threads = 1024;
  int blocks = (dimM + n_threads - 1) / n_threads;
  dim3 threads(n_threads);
  dim3 grid(blocks);
  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
  KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
}

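// Per-row negative log-likelihood of the labeled class:
//   E[i] = -log(O[i][label[i] % dimN]), one thread per row.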
__global__ void KeMatrixCrossEntropy(
    real* O, real* E, int* label, int dimM, int dimN) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int newBase;
  if (index < dimM) {
    newBase = label[index];
    newBase = newBase % dimN;
    E[index] = -log(O[index * dimN + newBase]);
  }
}

void hl_matrix_cross_entropy(
    real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);

  int blocks = (dimM + 1024 - 1) / 1024;
  dim3 threads(1024, 1);
  dim3 grid(blocks, 1);
  KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
      A_d, C_d, label_d, dimM, dimN);
  CHECK_SYNC("hl_matrix_cross_entropy failed");
}

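// Cross entropy backward: only the labeled column of each row is touched,
//   grad[i][label[i]] -= 1 / output[i][label[i]],
// with a 2-D grid covering dimM rows (x) and dimN columns (y).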
__global__ void KeMatrixCrossEntropyBp(
    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
  int index;
  if (rowIdx < dimM && colIdx < dimN) {
    index = rowIdx * dimN + colIdx;
    if (label_d[rowIdx] == colIdx) {
      grad_d[index] -= 1.0f / output_d[index];
    }
  }
}

void hl_matrix_cross_entropy_bp(
    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
  CHECK_NOTNULL(grad_d);
  CHECK_NOTNULL(output_d);
  CHECK_NOTNULL(label_d);

  int blocksX = dimM;
  int blocksY = (dimN + 1024 - 1) / 1024;
  dim3 threads(1, 1024);
  dim3 grid(blocksX, blocksY);
  KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_d, output_d, label_d, dimM, dimN);
  CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
}

void hl_matrix_zero_mem(real* data, int num) {
  hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
}

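// Parametric ReLU forward with slopes shared across groups of partial_sum
// consecutive columns: output = input if input > 0, otherwise
// w[tx / partial_sum] * input.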
__global__ void KeParamReluForward(real* output,
                                   real* input,
                                   real* w,
                                   int width,
                                   int height,
                                   int partial_sum) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int ty = blockIdx.y * blockDim.y + threadIdx.y;
  if (tx < width && ty < height) {
    int index = ty * width + tx;
    output[index] =
        input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
  }
}

void hl_param_relu_forward(real* output,
                           real* input,
                           real* w,
                           int width,
                           int height,
                           int partial_sum) {
  CHECK_NOTNULL(output);
  CHECK_NOTNULL(input);
  CHECK_NOTNULL(w);
  dim3 threads(16, 16);
  int blockX = (width + 16 - 1) / 16;
  int blockY = (height + 16 - 1) / 16;
  dim3 grid(blockX, blockY);
  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, input, w, width, height, partial_sum);
  CHECK_SYNC("hl_param_relu_forward failed");
}

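// Gradient with respect to the shared PReLU slopes: one block per weight
// (i.e. per group of partial_sum columns). Each block reduces
// sum(grad_o * input) over the elements of its group where input < 0 and
// accumulates the result into grad_w[blockIdx.x].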
template <int blockSize>
__global__ void KeParamReluBackWardW(real* grad_w,
                                     real* grad_o,
                                     real* input,
                                     int width,
                                     int height,
                                     int partial_sum) {
  const int tid = threadIdx.x;
  __shared__ real temp[blockSize];
  grad_o += partial_sum * blockIdx.x;
  input += partial_sum * blockIdx.x;
  real tmp = 0.0;
  for (int index = tid; index < partial_sum * height; index += blockSize) {
    int row = index / partial_sum;
    int offset = row * width + (index - row * partial_sum);
    if (input[offset] < 0) {
      tmp += grad_o[offset] * input[offset];
    }
  }
  temp[tid] = tmp;
  __syncthreads();
  for (int s = blockSize / 2; s > 0; s >>= 1) {
    if (tid < s) {
      temp[tid] += temp[tid + s];
    }
    __syncthreads();
  }
  if (tid == 0) {
    grad_w[blockIdx.x] += temp[0];
  }
}

void hl_param_relu_backward_w(real* grad_w,
                              real* grad_o,
                              real* input,
                              int width,
                              int height,
                              int partial_sum) {
  CHECK_NOTNULL(grad_w);
  CHECK_NOTNULL(grad_o);
  CHECK_NOTNULL(input);
  const int blockSize = 1024;
  int grid_num = width / partial_sum;
  dim3 threads(blockSize, 1);
  dim3 grid(grid_num, 1);
  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_w, grad_o, input, width, height, partial_sum);
  CHECK_SYNC("hl_param_relu_backward_w failed");
}

__global__ void KeParamReluBackwardDiff(real* grad_o,
                                        real* input,
                                        real* w,
                                        real* diff,
                                        int width,
                                        int height,
                                        int partial_sum) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int ty = blockIdx.y * blockDim.y + threadIdx.y;
  if (tx < width && ty < height) {
    int index = ty * width + tx;
    diff[index] += grad_o[index] * (input[index] > 0 ? 1 : w[tx / partial_sum]);
  }
}

void hl_param_relu_backward_diff(real* grad_o,
                                 real* data,
                                 real* w,
                                 real* diff,
                                 int width,
                                 int height,
                                 int partial_sum) {
  CHECK_NOTNULL(grad_o);
  CHECK_NOTNULL(data);
  CHECK_NOTNULL(w);
  CHECK_NOTNULL(diff);
  dim3 threads(16, 16);
  int blockX = (width + 16 - 1) / 16;
  int blockY = (height + 16 - 1) / 16;
  dim3 grid(blockX, blockY);
  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_o, data, w, diff, width, height, partial_sum);
  CHECK_SYNC("hl_param_relu_backward_diff failed");
}

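// Adds a channel-shared bias: with N = channel * dim, element A[i][j] gets
// scale * B[j / dim], so every block of dim consecutive columns shares one
// bias value.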
__global__ void KeMatrixAddSharedBias(
    real* A, real* B, const int channel, const int M, const int N, real scale) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int dim = N / channel;
  if (index < M * N) {
    int i = index % N;
    i = i / dim;
    A[index] += scale * B[i];
  }
}

void hl_matrix_add_shared_bias(real* A_d,
                               real* B_d,
                               const int channel,
                               const int dimM,
                               const int dimN,
                               real scale) {
  const int blocks = 512;
  const int grids = DIVUP(dimM * dimN, blocks);
  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
      A_d, B_d, channel, dimM, dimN, scale);
  CHECK_SYNC("hl_matrix_add_shared_bias failed");
}

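// Reduces a channel-shared bias gradient: B[c] += scale * sum of A over all
// rows and over the dim = N / channel columns belonging to channel c. A simple
// per-channel loop is used when dim is small; otherwise each block performs a
// shared-memory reduction for one channel.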
template <int blockSize>
__global__ void KeMatrixCollectSharedBias(real* B,
                                          real* A,
518 519 520 521 522 523
                                          const int channel,
                                          const int M,
                                          const int N,
                                          const int dim,
                                          const int limit,
                                          real scale) {
  if (dim < limit) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < channel) {
      real sum = 0.0;
      for (int i = 0; i < M; ++i) {
        for (int j = 0; j < dim; ++j) {
          sum += A[i * N + index * dim + j];
        }
      }
      B[index] += scale * sum;
    }
  } else {
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    __shared__ real smem[blockSize];
    real sum = 0.0;
    for (int j = 0; j < ((dim * M + blockSize - 1) / blockSize); ++j) {
      int n = j * blockSize + tid;
      int m = n / dim;
      int w = n % dim;
      smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
      __syncthreads();
      simpleReduce(smem, tid, blockSize);
      sum += smem[0];
    }
    if (tid == 0) {
      B[bid] += scale * sum;
    }
  }
}

void hl_matrix_collect_shared_bias(real* B_d,
                                   real* A_d,
                                   const int channel,
                                   const int dimM,
                                   const int dimN,
                                   real scale) {
  const int dim = dimN / channel;
  const int blocks = 256;
  const int limit = 64;
  int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;

  KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
      B_d, A_d, channel, dimM, dimN, dim, limit, scale);
  CHECK_SYNC("hl_matrix_collect_shared_bias failed");
}

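// Rotates the dimM x dimN matrix mat by 90 degrees into the dimN x dimM matrix
// matRot, clockwise or counter-clockwise, one thread per element.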
__global__ void keMatrixRotate(
    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < dimM * dimN) {
    int i = idx / dimN;
    int j = idx % dimN;
    if (clockWise) {
      matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
    } else {
      matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
    }
  }
}

void hl_matrix_rotate(
    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
  CHECK_NOTNULL(mat);
  CHECK_NOTNULL(matRot);
  const int threads = 512;
  const int blocks = DIVUP(dimM * dimN, threads);
  keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
      mat, matRot, dimM, dimN, clockWise);
  CHECK_SYNC("hl_matrix_rotate failed");
}

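// 3-D analogue of im2col (vol2col): for every convolution output position the
// filterD x filterH x filterW input patch it covers is unfolded into a column
// of dataDst, with zero padding at the borders. A grid-stride loop covers
// num_kernels = channels * depth_col * height_col * width_col positions.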
__global__ void keMatrixVol2Col(int num_kernels,
                                real* dataSrc,
                                real* dataDst,
                                int depth,
                                int height,
                                int width,
                                int filterD,
                                int filterH,
                                int filterW,
                                int strideD,
                                int strideH,
                                int strideW,
                                int paddingD,
                                int paddingH,
                                int paddingW,
                                int depth_col,
                                int height_col,
                                int width_col) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
       index += blockDim.x * gridDim.x) {
    int w_out = index % width_col;
    int h_out = (index / width_col) % height_col;
    int d_out = (index / width_col / height_col) % depth_col;
    int channel_in = index / width_col / height_col / depth_col;
    int channel_out = channel_in * filterD * filterH * filterW;
    int w_in = w_out * strideW - paddingW;
    int h_in = h_out * strideH - paddingH;
    int d_in = d_out * strideD - paddingD;

    dataDst +=
        ((channel_out * depth_col + d_out) * height_col + h_out) * width_col +
        w_out;
    dataSrc += ((channel_in * depth + d_in) * height + h_in) * width + w_in;
    for (int k = 0; k < filterD; ++k) {
      for (int i = 0; i < filterH; ++i) {
        for (int j = 0; j < filterW; ++j) {
          int d = d_in + k;
          int h = h_in + i;
          int w = w_in + j;
          *dataDst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
                      w < width)
                         ? dataSrc[(k * height + i) * width + j]
                         : 0;
          dataDst += depth_col * height_col * width_col;
        }
      }
    }
  }
}

void hl_matrix_vol2Col(real* dataSrc,
                       int channels,
                       int depth,
                       int height,
                       int width,
                       int filterD,
                       int filterH,
                       int filterW,
                       int strideD,
                       int strideH,
                       int strideW,
                       int paddingD,
                       int paddingH,
                       int paddingW,
                       real* dataDst) {
  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
  int num_kernels = channels * depth_col * height_col * width_col;

  const int threads = 512;
  const int blocks = DIVUP(num_kernels, threads);

  keMatrixVol2Col<<<blocks, threads>>>(num_kernels,
                                       dataSrc,
                                       dataDst,
                                       depth,
                                       height,
                                       width,
                                       filterD,
                                       filterH,
                                       filterW,
                                       strideD,
                                       strideH,
                                       strideW,
                                       paddingD,
                                       paddingH,
                                       paddingW,
                                       depth_col,
                                       height_col,
                                       width_col);
  CHECK_SYNC("hl_matrix_vol2Col failed");
}

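// Inverse of vol2col (3-D col2im): each voxel of the channels x depth x height
// x width volume accumulates every column entry that was unfolded from it, and
// the result is blended as dataDst = alpha * accumulated + beta * old value.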
__global__ void keMatrixCol2Vol(int num_kernels,
                                real* dataDst,
                                real* dataSrc,
                                int depth,
                                int height,
                                int width,
                                int filterD,
                                int filterH,
                                int filterW,
                                int strideD,
                                int strideH,
                                int strideW,
                                int paddingD,
                                int paddingH,
                                int paddingW,
                                int depth_col,
                                int height_col,
                                int width_col,
                                real alpha,
                                real beta) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
       index += blockDim.x * gridDim.x) {
    real srcVal = 0;
    real dstVal = dataDst[index];
    int w = index % width + paddingW;
    int h = (index / width) % height + paddingH;
    int d = (index / width / height) % depth + paddingD;
    int c = index / width / height / depth;
    // compute the start and end of the output
    int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1;
    int w_col_end = min(w / strideW + 1, width_col);
    int h_col_start = (h < filterH) ? 0 : (h - filterH) / strideH + 1;
    int h_col_end = min(h / strideH + 1, height_col);
    int d_col_start = (d < filterD) ? 0 : (d - filterD) / strideD + 1;
    int d_col_end = min(d / strideD + 1, depth_col);

    int offset = (c * filterD * filterW * filterH + d * filterW * filterH +
                  h * filterW + w) *
                 depth_col * height_col * width_col;

    int coeff_d_col =
        (1 - strideD * filterW * filterH * depth_col) * height_col * width_col;
    int coeff_h_col =
        (1 - strideH * filterW * depth_col * height_col) * width_col;
    int coeff_w_col = (1 - strideW * depth_col * height_col * width_col);

    for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
          srcVal += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col +
                            w_col * coeff_w_col];
        }
      }
    }
    dataDst[index] = alpha * srcVal + beta * dstVal;
  }
}

void hl_matrix_col2Vol(real* dataDst,
                       int channels,
                       int depth,
                       int height,
                       int width,
                       int filterD,
                       int filterH,
                       int filterW,
                       int strideD,
                       int strideH,
                       int strideW,
                       int paddingD,
                       int paddingH,
                       int paddingW,
                       real* dataSrc,
                       real alpha,
                       real beta) {
  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
  int num_kernels = channels * depth * height * width;

  const int threads = 512;
  const int blocks = DIVUP(num_kernels, threads);

  keMatrixCol2Vol<<<blocks, threads>>>(num_kernels,
                                       dataDst,
                                       dataSrc,
                                       depth,
                                       height,
                                       width,
                                       filterD,
                                       filterH,
                                       filterW,
                                       strideD,
                                       strideH,
                                       strideW,
                                       paddingD,
                                       paddingH,
                                       paddingW,
                                       depth_col,
                                       height_col,
                                       width_col,
                                       alpha,
                                       beta);

  CHECK_SYNC("hl_matrix_col2Vol failed");
}