diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index f55197c8c9ebb4a0f67ab915abfefd6a45cd13aa..9f84db72da24b0e678520b077f9cba7ffc2d589a 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -17,73 +17,6 @@ limitations under the License. */
 
 #include "hl_base.h"
 
-/**
- * @brief   Shrink column to feature.
- *
- * @param[in]   dataCol     expand data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   blockH      filter height.
- * @param[in]   blockW      filter width.
- * @param[in]   strideH     stride height.
- * @param[in]   strideW     stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[in]   outputH     output height.
- * @param[in]   outputW     output width.
- * @param[out]  dataIm      output image data.
- * @param[in]   alpha
- * @param[in]   beta
- */
-extern void hl_shrink_col2feature(const real* dataCol,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataIm,
-                                  real alpha = 1.0f,
-                                  real beta = 0.0f);
-
-/**
- * @brief   Expand feature to column.
- *
- * @param[in]   dataIm      input image data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   blockH      filter height.
- * @param[in]   blockW      filter width.
- * @param[in]   strideH     stride height.
- * @param[in]   strideW     stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[in]   outputH     output height.
- * @param[in]   outputW     output width.
- * @param[out]  dataCol     expand data.
- *
- */
-extern void hl_expand_feature2col(const real* dataIm,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataCol);
-
 /**
  * @brief   Maximum pool forward.
  *
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index 039551c6cc69525e71c8c311f78fb6dec07d7fed..2bbb9fa8dfd5eeac9d55aa67a28ebfbffa2acd46 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -17,36 +17,6 @@ limitations under the License. */
 
 #include "hl_cnn.h"
 
-inline void hl_shrink_col2feature(const real* dataCol,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataIm,
-                                  real alpha,
-                                  real beta) {}
-
-inline void hl_expand_feature2col(const real* dataIm,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataCol) {}
-
 inline void hl_maxpool_forward(const int frameCnt,
                                const real* inputData,
                                const int channels,
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index b94f4d8fe4a251750c527d4b686fcc8f452d4606..b6e3e63a4f52261e49467bd82fdabd063e81460e 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -18,134 +18,6 @@ limitations under the License. */
 #include "hl_cnn.h"
 #include "hl_device_functions.cuh"
 
-__global__ void KeFeature2col(size_t n, size_t height, const real* data_im,
-                              size_t blockH, size_t blockW, size_t width,
-                              size_t strideH, size_t strideW,
-                              size_t paddingH, size_t paddingW,
-                              size_t height_col, size_t width_col,
-                              real* data_col) {
-  size_t index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < n) {
-    size_t w_out = index % width_col;
-    index /= width_col;
-    size_t h_out = index % height_col;
-    size_t channel_in = index / height_col;
-    size_t channel_out = channel_in * blockH * blockW;
-    size_t h_in = h_out * strideH;
-    size_t w_in = w_out * strideW;
-
-    data_col += (channel_out * height_col + h_out) * width_col + w_out;
-    for (size_t i = 0; i < blockH; ++i) {
-      for (size_t j = 0; j < blockW; ++j) {
-        int rIdx = int(h_in+i);
-        int cIdx = int(w_in+j);
-        if ((rIdx-(int)paddingH) >= (int)height ||
-            (rIdx-(int)paddingH) < 0 ||
-            (cIdx-(int)paddingW) >= (int)width ||
-            (cIdx-(int)paddingW) < 0) {
-          *data_col = 0;
-        } else {
-          rIdx = rIdx + channel_in*height - paddingH;
-          cIdx = cIdx - paddingW;
-          *data_col = data_im[rIdx* width + cIdx];
-        }
-        data_col += height_col * width_col;
-      }
-    }
-  }
-}
-
-void hl_expand_feature2col(const real* dataIm, size_t channels,
-                           size_t height, size_t width,
-                           size_t blockH, size_t blockW,
-                           size_t strideH, size_t strideW,
-                           size_t paddingH, size_t paddingW,
-                           size_t outputH, size_t outputW,
-                           real* dataCol) {
-  size_t numKernels = channels * outputH * outputW;
-
-  size_t blocks = (numKernels + 1024 -1) / 1024;
-  size_t blockX = 512;
-  size_t blockY = (blocks+512-1)/512;
-  dim3 threads(1024, 1);
-  dim3 grid(blockX, blockY);
-  KeFeature2col<<< grid, threads, 0, STREAM_DEFAULT >>>
-           (numKernels, height, dataIm, blockH, blockW, width,
-           strideH, strideW, paddingH, paddingW,
-           outputH, outputW, dataCol);
-  CHECK_SYNC("hl_expand_feature2col failed");
-}
-
-__global__ void KeCol2Feature(size_t n, const real* data_col, size_t height,
-                              size_t width, size_t channels,
-                              size_t blockH, size_t blockW,
-                              size_t strideH, size_t strideW,
-                              size_t paddingH, size_t paddingW,
-                              size_t height_col, size_t width_col,
-                              real* data_im, real alpha, real beta) {
-  size_t index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < n) {
-    real val = 0;
-    int w = int(index % width);
-    int h = int((index / width) % height);
-    int c = int(index / (width * height));
-    if ((w - (int)paddingW) >= 0 &&
-        (w - (int)paddingW) < (width-2 * paddingW) &&
-        (h - (int)paddingH) >= 0 &&
-        (h - paddingH) < (height - 2 * paddingH)) {
-      // compute the start and end of the output
-      int w_col_start =
-        (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
-      int w_col_end =
-        min((int)(w / (int)strideW + 1), (int)(width_col));
-      int h_col_start =
-        (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
-      int h_col_end = min(int(h / strideH + 1), int(height_col));
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          // the col location: [c * width * height + h_out, w_out]
-          int c_col = int(c * blockH* blockW) + \
-            (h - h_col * (int)strideH) * (int)blockW +
-            (w - w_col * (int)strideW);
-          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
-        }
-      }
-      h -= paddingH;
-      w -= paddingW;
-      real tD = data_im[c*((width-2*paddingW) * (height-2*paddingH)) +
-                          h*(width-2*paddingW) + w];
-      data_im[c*((width-2*paddingW) * (height-2*paddingH)) +
-              h*(width-2*paddingW) + w] = alpha * val + beta*tD;
-    }
-  }
-}
-
-void hl_shrink_col2feature(const real * dataCol, size_t channels,
-                           size_t height, size_t width,
-                           size_t blockH, size_t blockW,
-                           size_t strideH, size_t strideW,
-                           size_t paddingH, size_t paddingW,
-                           size_t outputH, size_t outputW,
-                           real* dataIm, real alpha, real beta) {
-  size_t numKernels = channels * (height + 2*paddingH) * (width + 2*paddingW);
-
-  size_t blocks = (numKernels + 1024 -1) / 1024;
-  size_t blockX = 512;
-  size_t blockY = (blocks+512-1)/512;
-  dim3 threads(1024, 1);
-  dim3 grid(blockX, blockY);
-
-  // To avoid involving atomic operations, we will launch one kernel per
-  // bottom dimension, and then in the kernel add up the top dimensions.
-  KeCol2Feature<<< grid, threads, 0, STREAM_DEFAULT >>>
-           (numKernels, dataCol, height + 2*paddingH, width + 2*paddingW,
-           channels, blockH, blockW, strideH, strideW, paddingH, paddingW,
-           outputH, outputW, dataIm, alpha, beta);
-  CHECK_SYNC("hl_shrink_col2feature failed");
-}
-
 __global__ void KeMaxPoolForward(const int nthreads, const real* inputData,
                                  const int channels, const int height,
                                  const int width,
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index c910146164ebfb0737583c72c48ce6dbc5b49939..a3ad9d46e4988baf59cacade712e207cf9ba6ff2 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1016,81 +1016,6 @@ void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) {
   LOG(INFO) << "the  diffCnt is " << diffCnt;
 }
 
-void GpuMatrix::convExpand(Matrix& feature,
-                           int feaImgHeight,
-                           int feaImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW) {
-  CHECK(feature.useGpu_ == true) << "Matrix type are not equal";
-
-  CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels),
-           feature.getHeight() * feature.getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockH * blockW * channels;
-  CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal";
-
-  hl_expand_feature2col(feature.getData(),
-                        channels,
-                        feaImgHeight,
-                        feaImgWidth,
-                        blockH,
-                        blockW,
-                        strideH,
-                        strideW,
-                        paddingH,
-                        paddingW,
-                        outputH,
-                        outputW,
-                        getData());
-}
-
-void GpuMatrix::convShrink(Matrix& expandFeat,
-                           int thisImgHeight,
-                           int thisImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW,
-                           real alpha,
-                           real beta) {
-  CHECK(expandFeat.useGpu_ == true) << "Matrix type are not equal";
-  CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels),
-           getHeight() * getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockW * blockH * channels;
-  CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth())
-      << "Matrix dimensions are not equal";
-  hl_shrink_col2feature(expandFeat.getData(),
-                        channels,
-                        thisImgHeight,
-                        thisImgWidth,
-                        blockH,
-                        blockW,
-                        strideH,
-                        strideW,
-                        paddingH,
-                        paddingW,
-                        outputH,
-                        outputW,
-                        getData(),
-                        alpha,
-                        beta);
-}
-
 void GpuMatrix::maxPoolForward(Matrix& inputMat,
                                size_t imgSizeH,
                                size_t imgSizeW,
@@ -1775,103 +1700,6 @@ void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
   CHECK_EQ(info, 0);
 }
 
-void CpuMatrix::convExpand(Matrix& feature,
-                           int feaImgHeight,
-                           int feaImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW) {
-  CHECK(feature.useGpu_ == false) << "Matrix type are not equal";
-
-  CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels),
-           feature.getHeight() * feature.getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockH * blockW * channels;
-  CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal";
-
-  int channelsCol = channels * blockH * blockW;
-  real* srcData = feature.getData();
-  for (int c = 0; c < channelsCol; ++c) {
-    int wOffset = c % blockW;
-    int hOffset = (c / blockW) % blockH;
-    int c_im = c / blockH / blockW;
-    for (int h = 0; h < outputH; ++h) {
-      for (int w = 0; w < outputW; ++w) {
-        // no c_im*height to Exclude the channel number
-        int imgRowIdx = h * strideH + hOffset;
-        int imgColIdx = w * strideW + wOffset;
-        if ((imgRowIdx - paddingH) < 0 ||
-            (imgRowIdx - paddingH) >= feaImgHeight ||
-            (imgColIdx - paddingW) < 0 ||
-            (imgColIdx - paddingW) >= feaImgWidth) {
-          data_[(c * outputH + h) * outputW + w] = 0;
-        } else {
-          imgRowIdx += c_im * feaImgHeight - paddingH;
-          imgColIdx -= paddingW;
-          data_[(c * outputH + h) * outputW + w] =
-              srcData[imgRowIdx * feaImgWidth + imgColIdx];
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::convShrink(Matrix& expandFeat,
-                           int thisImgHeight,
-                           int thisImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW,
-                           real alpha,
-                           real beta) {
-  CHECK(expandFeat.useGpu_ == false) << "Matrix type are not equal";
-  CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels),
-           getHeight() * getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockH * blockW * channels;
-
-  CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth())
-      << "Matrix dimensions are not equal";
-
-  real* expandData = expandFeat.getData();
-  int channelsCol = channels * blockH * blockW;
-  for (int c = 0; c < channelsCol; ++c) {
-    int wOffset = c % blockW;
-    int hOffset = (c / blockW) % blockH;
-    int c_im = c / blockW / blockH;
-    for (int h = 0; h < outputH; ++h) {
-      for (int w = 0; w < outputW; ++w) {
-        int imRowIdx = h * strideH + hOffset;
-        int imColIdx = w * strideW + wOffset;
-        if ((imRowIdx - paddingH) >= 0 &&
-            (imRowIdx - paddingH) < thisImgHeight &&
-            (imColIdx - paddingW) >= 0 &&
-            (imColIdx - paddingW) < thisImgWidth) {
-          imRowIdx += c_im * thisImgHeight - paddingH;
-          imColIdx -= paddingW;
-          data_[imRowIdx * thisImgWidth + imColIdx] =
-              alpha * expandData[(c * outputH + h) * outputW + w] +
-              beta * data_[imRowIdx * thisImgWidth + imColIdx];
-        }
-      }
-    }
-  }
-}
-
 void CpuMatrix::maxPoolForward(Matrix& inputMat,
                                size_t imgSizeH,
                                size_t imgSizeW,
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 748be850b4c902d1b48c1dafbb0d5ea2bf197e6e..bbf98a609c63eeb14e66775960a5b5fc460d7ca6 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -858,49 +858,6 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-  /**
-   * This function is used to calculate the convolution:
-   *
-   * It will expand a feature matrix according to the
-   * convolution filters
-   */
-  virtual void convExpand(Matrix& feature,
-                          int feaImgHeight,
-                          int feaImgWidth,
-                          int channels,
-                          int blockH,
-                          int blockW,
-                          int strideH,
-                          int strideW,
-                          int paddingH,
-                          int paddingW,
-                          int outputH,
-                          int outputW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * This function is the reverse implementation of convExpand:
-   *
-   * Its function is to restore a expanded-matrix into a feature matrix
-   */
-  virtual void convShrink(Matrix& expandColMat,
-                          int thisImgHeight,
-                          int thisImgWidth,
-                          int channels,
-                          int blockH,
-                          int blockW,
-                          int strideH,
-                          int strideW,
-                          int paddingH,
-                          int paddingW,
-                          int outputH,
-                          int outputW,
-                          real alpha = 1.0f,
-                          real beta = 0.0f) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
   /**
    * Pooling forward operation, pick out the largest element
    * in the sizeX of value
@@ -1334,34 +1291,6 @@ public:
 
   void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
 
-  void convExpand(Matrix& feature,
-                  int feaImgHeight,
-                  int feaImgWidth,
-                  int channels,
-                  int blockH,
-                  int blockW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  int outputH,
-                  int outputW);
-
-  void convShrink(Matrix& expandColMat,
-                  int thisImgHeight,
-                  int thisImgWidth,
-                  int channels,
-                  int blockH,
-                  int blochW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingWreal,
-                  int outputH,
-                  int outputW,
-                  real alpha = 1.0f,
-                  real beta = 0.0f);
-
   void maxPoolForward(Matrix& inputMat,
                       size_t imgSizeH,
                       size_t imgSizeW,
@@ -1521,34 +1450,6 @@ public:
 
   MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
 
-  void convExpand(Matrix& feature,
-                  int feaImgHeight,
-                  int feaImgWidth,
-                  int channels,
-                  int blcokH,
-                  int blockW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  int outputH,
-                  int outputW);
-
-  void convShrink(Matrix& expandFeat,
-                  int thisImgHeight,
-                  int thisImgWidth,
-                  int channels,
-                  int blockH,
-                  int blockW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  int outputH,
-                  int outputW,
-                  real alpha = 1.0f,
-                  real beta = 0.0f);
-
   void maxPoolForward(Matrix& inputMat,
                       size_t imgSizeH,
                       size_t imgSizeW,