diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h
index eb454c59c1e58cf2b4817b4cb3230b9d75e320ac..da2ed8cabb766d6c951e8dd5ff2e8e87189979f6 100644
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -224,4 +224,62 @@ extern void hl_matrix_collect_shared_bias(real* B_d,
 extern void hl_matrix_rotate(
     real* mat, real* matRot, int dimM, int dimN, bool clockWise);
 
+/**
+ * @brief  Matrix vol2Col: Convert 3D volume into col matrix
+ *
+ * @param[in]   matSrc     input matrix.
+ * @param[in]   channel    channel of matSrc.
+ * @param[in]   depth      depth of matSrc.
+ * @param[in]   height     height of matSrc.
+ * @param[in]   width      width of matSrc.
+ * @param[in]   filterD    depth of filter.
+ * @param[in]   filterH    height of filter.
+ * @param[in]   filterW    width of filter.
+ * @param[in]   strideD    stride in the depth.
+ * @param[in]   strideH    stride in the height.
+ * @param[in]   strideW    stride in the width.
+ * @param[in]   paddingD   padding in the depth.
+ * @param[in]   paddingH   padding in the height.
+ * @param[in]   paddingW   padding in the width. 
+ * @param[out]   matDst     output matrix.
+ * 
+ */
+extern void hl_matrix_vol2Col(real* matSrc,
+                 int channel, int depth, int height, int width,
+                 int filterD, int filterH, int filterW,
+                 int strideD, int strideH, int strideW,
+                 int paddingD, int paddingH, int paddingW,
+                 real* matDst);
+
+/**
+ * @brief  Matrix col2Vol: Convert col matrix into 3D volume
+ *
+ * @param[out]  matDst     output matrix.
+ * @param[in]   channel    channel of matDst.
+ * @param[in]   depth      depth of matDst.
+ * @param[in]   height     height of matDst.
+ * @param[in]   width      width of matDst.
+ * @param[in]   filterD    depth of filter.
+ * @param[in]   filterH    height of filter.
+ * @param[in]   filterW    width of filter.
+ * @param[in]   strideD    stride in the depth.
+ * @param[in]   strideH    stride in the height.
+ * @param[in]   strideW    stride in the width.
+ * @param[in]   paddingD   padding in the depth.
+ * @param[in]   paddingH   padding in the height.
+ * @param[in]   paddingW   padding in the width. 
+ * @param[in]   matSrc     input matrix.
+ * @param[in]   beta       input 
+ * @param[in]   alpha      input 
+ * 
+ */
+extern void hl_matrix_col2Vol(real* matDst,
+                int channels, int depth, int height, int width,
+                int filterD, int filterH, int filterW,
+                int strideD, int strideH, int strideW,
+                int paddingD, int paddingH, int paddingW,
+                real* matSrc,
+                real alpha, real beta);
+
+
 #endif /* HL_MATRIX_H_ */
diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h
index 127cb7e27983e8ff2c1ff6ef5108b5f8c5bd6ca5..0b73777812ae669bcd18930eef3198dbfe7081c7 100644
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -99,4 +99,19 @@ inline void hl_matrix_collect_shared_bias(real* B_d,
 inline void hl_matrix_rotate(
     real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}
 
+inline void hl_matrix_vol2Col(real* data,
+                       int channels, int depth, int height, int width,
+                       int filterD, int filterH, int filterW,
+                       int strideD, int strideH, int strideW,
+                       int paddingD, int paddingH, int paddingW,
+                       real* data_col) {}
+
+inline void hl_matrix_col2Vol(real* data,
+                       int channels, int depth, int height, int width,
+                       int filterD, int filterH, int filterW,
+                       int strideD, int strideH, int strideW,
+                       int paddingD, int paddingH, int paddingW,
+                       real* data_Im,
+                       real alpha, real beta) {}
+
 #endif  // HL_MATRIX_STUB_H_
diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu
index 39272456c394adc0509e60cf5972df832f7b3424..f626c07a0c39afbf8bb7575f695e8104b86a95ea 100644
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -592,3 +592,138 @@ void hl_matrix_rotate(
       mat, matRot, dimM, dimN, clockWise);
   CHECK_SYNC("hl_matrix_rotate failed");
 }
+
+
+__global__ void keMatrixVol2Col(
+        int num_kernels, real*dataSrc, real* dataDst,
+        int depth, int height, int width,
+        int filterD, int filterH, int filterW,
+        int strideD, int strideH, int strideW,
+        int paddingD, int paddingH, int paddingW,
+        int depth_col, int height_col, int width_col){
+
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x;
+       index < num_kernels;
+       index += blockDim.x * gridDim.x){
+
+    int w_out = index % width_col;
+    int h_out = (index / width_col ) % height_col;
+    int d_out = (index / width_col / height_col) % depth_col;
+    int channel_in = index / width_col / height_col / depth_col;
+    int channel_out = channel_in * filterD * filterH * filterW;
+    int w_in = w_out * strideW - paddingW;
+    int h_in = h_out * strideH - paddingH;
+    int d_in = d_out * strideD - paddingD;
+
+    dataDst += ((channel_out * depth_col + d_out) * height_col + h_out) * width_col + w_out;
+    dataSrc += ((channel_in * depth + d_in) * height + h_in) * width + w_in;
+    for (int k = 0; k < filterD; ++k) {
+      for (int i = 0; i < filterH; ++i) {
+        for (int j = 0; j < filterW; ++j) {
+          int d = d_in + k;
+          int h = h_in + i;
+          int w = w_in + j;
+          *dataDst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 && w < width ) ?
+                      dataSrc[(k * height + i) * width + j] : 0;
+          dataDst += depth_col * height_col * width_col;
+        }
+      }
+    }
+  }
+}
+
+void hl_matrix_vol2Col(real* dataSrc,
+                       int channels, int depth, int height, int width,
+                       int filterD, int filterH, int filterW,
+                       int strideD, int strideH, int strideW,
+                       int paddingD, int paddingH, int paddingW, real* dataDst){
+
+  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
+  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
+  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
+  int num_kernels = channels * depth_col * height_col * width_col;
+
+  const int threads = 512;
+  const int blocks = DIVUP(num_kernels, threads);
+
+  keMatrixVol2Col<<< blocks, threads >>>(
+          num_kernels, dataSrc, dataDst,
+                  depth, height, width,
+                  filterD, filterH, filterW,
+                  strideD, strideH, strideW,
+                  paddingD, paddingH, paddingW,
+                  depth_col, height_col, width_col);
+  CHECK_SYNC("hl_matrix_vol2Col failed");
+}
+
+__global__ void keMatrixCol2Vol(
+        int num_kernels, real*dataDst, real* dataSrc,
+        int depth, int height, int width,
+        int filterD, int filterH, int filterW,
+        int strideD, int strideH, int strideW,
+        int paddingD, int paddingH, int paddingW,
+        int depth_col, int height_col, int width_col,
+        real alpha, real beta){
+
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x;
+       index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+
+    real val = 0;
+    int w = index % width + paddingW;
+    int h = (index / width) % height + paddingH;
+    int d = (index / width / height) % depth + paddingD;
+    int c = index / (width * height * depth);
+    // compute the start and end of the output
+    int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1;
+    int w_col_end = min(w / strideW + 1, width_col);
+    int h_col_start = (h < filterH) ? 0 : (h - filterH) / strideH + 1;
+    int h_col_end = min(h / strideH + 1, height_col);
+    int d_col_start = (d < filterD) ? 0 : (d - filterD) / strideD + 1;
+    int d_col_end = min(d / strideD + 1, depth_col);
+
+    int offset = (c * filterD * filterW * filterH + \
+                  d * filterW * filterH + h * filterW + w) * depth_col * height_col * width_col;
+
+    int coeff_d_col = (1 - strideD * filterW * filterH * depth_col) * height_col * width_col;
+    int coeff_h_col = (1 - strideH * filterW * depth_col * height_col) * width_col;
+    int coeff_w_col = (1 - strideW * depth_col * height_col * width_col);
+
+    for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          val += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col + w_col * coeff_w_col];
+        }
+      }
+    }
+    dataDst[index] = val;
+  }
+}
+
+void hl_matrix_col2Vol(real* dataDst,
+                       int channels, int depth, int height, int width,
+                       int filterD, int filterH, int filterW,
+                       int strideD, int strideH, int strideW,
+                       int paddingD, int paddingH, int paddingW,
+                       real* dataSrc,
+                       real alpha, real beta){
+
+  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
+  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
+  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
+  int num_kernels = channels * depth * height * width;
+
+  const int threads = 512;
+  const int blocks = DIVUP(num_kernels, threads);
+
+  keMatrixCol2Vol<<< blocks, threads >>>(
+          num_kernels, dataDst, dataSrc,
+                  depth, height, width,
+                  filterD, filterH, filterW,
+                  strideD, strideH, strideW,
+                  paddingD, paddingH, paddingW,
+                  depth_col, height_col, width_col,
+                  alpha, beta);
+
+  CHECK_SYNC("hl_matrix_col2Vol failed");
+}