diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp
index 426b6c8e312e4b5e7a68370ffcc87e152fdabd26..0c10f30bbd9c129a949b660fb1a2e8122ea18597 100644
--- a/paddle/function/ImageExpandOp.cpp
+++ b/paddle/function/ImageExpandOp.cpp
@@ -119,12 +119,17 @@ public:
         1 +
         (inputWidth + 2 * paddingW() - blockW() + strideW() - 1) / strideW();
     CHECK_EQ(seqLength, outputHeight * outputWidth);
-    CHECK_EQ(stepSize, inputChannels * blockH() * blockH());
+    CHECK_EQ(stepSize, inputChannels * blockH() * blockW());
 
     real* inputData = inputs[0].data<real>();
     real* outputData = outputs[0].data<real>();
     Im2ColFunctor im2col;
     for (size_t i = 0; i < batchSize; i++) {
+      // The result of im2col is [output_height, output_width,
+      // input_channels, filter_height, filter_width], and it is easy to
+      // reshape into [seqLength, stepSize], where seqLength is equal to
+      // output_height * output_width and stepSize is equal to
+      // input_channels * filter_height * filter_width.
       im2col(inputData,
              inputChannels,
              inputHeight,
@@ -161,4 +166,6 @@ protected:
   inline int blockW() const { return blocks_[1]; }
 };
 
+REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandFunction);
+
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp
index 2bafeb92158c56efe32f90742807f0af07bda5af..9760d39bb4a3c8f94a2ffbd1fbd2ff7438298b9b 100644
--- a/paddle/gserver/layers/BlockExpandLayer.cpp
+++ b/paddle/gserver/layers/BlockExpandLayer.cpp
@@ -37,6 +37,18 @@ bool BlockExpandLayer::init(const LayerMap& layerMap,
   imgSizeH_ = blockConf.img_size_y();
   imgSizeW_ = blockConf.img_size_x();
 
+  if (!useGpu_) {
+    std::vector<size_t> strides = {(size_t)strideH_, (size_t)strideW_};
+    std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
+    std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
+    createFunction(forward_,
+                   "ImageExpand",
+                   FuncConfig()
+                       .set("strides", strides)
+                       .set("paddings", paddings)
+                       .set("blocks", blocks));
+  }
+
   return true;
 }
 
@@ -63,10 +75,11 @@ void BlockExpandLayer::forward(PassType passType) {
   Layer::forward(passType);
 
   size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-
   size_t blockNum = getBlockNum();
   size_t blockSize = blockH_ * blockW_ * channels_;
   resetOutput(blockNum * batchSize, blockSize);
+  // TODO(hedaoyuan): After completing the GPU version of ImageExpand,
+  // refactor the following code.
   Argument& out = getOutput();
   MatrixPtr outV = getOutputValue();
 
@@ -78,38 +91,49 @@ void BlockExpandLayer::forward(PassType passType) {
   int* start = out.sequenceStartPositions->getMutableData(false);
   int* dims = out.cpuSequenceDims->getData();
   for (size_t i = 0; i < batchSize; i++) {
-    outVTrans_->zeroMem();
-    /* expand each block as one row */
-    MatrixPtr inputTmp =
-        Matrix::create(input->getData() + i * input->getWidth(),
-                       1,
-                       input->getWidth(),
-                       false,
-                       useGpu_);
-    outVTrans_->convExpand(*inputTmp,
-                           imgSizeH_,
-                           imgSizeW_,
-                           channels_,
-                           blockH_,
-                           blockW_,
-                           strideH_,
-                           strideW_,
-                           paddingH_,
-                           paddingW_,
-                           outputH_,
-                           outputW_);
-    MatrixPtr outVTmp =
-        Matrix::create(outV->getData() + i * blockNum * blockSize,
-                       blockNum,
-                       blockSize,
-                       false,
-                       useGpu_);
-    outVTrans_->transpose(outVTmp, false);
+    if (useGpu_) {
+      outVTrans_->zeroMem();
+      /* expand each block as one row */
+      MatrixPtr inputTmp =
+          Matrix::create(input->getData() + i * input->getWidth(),
+                         1,
+                         input->getWidth(),
+                         false,
+                         useGpu_);
+      outVTrans_->convExpand(*inputTmp,
+                             imgSizeH_,
+                             imgSizeW_,
+                             channels_,
+                             blockH_,
+                             blockW_,
+                             strideH_,
+                             strideW_,
+                             paddingH_,
+                             paddingW_,
+                             outputH_,
+                             outputW_);
+      MatrixPtr outVTmp =
+          Matrix::create(outV->getData() + i * blockNum * blockSize,
+                         blockNum,
+                         blockSize,
+                         false,
+                         useGpu_);
+      outVTrans_->transpose(outVTmp, false);
+    }
     start[i] = i * blockNum;
     dims[2 * i] = outputH_;
     dims[2 * i + 1] = outputW_;
   }
   start[batchSize] = batchSize * blockNum;
+  if (!useGpu_) {
+    TensorShape inputShape({batchSize, channels_, imgSizeH_, imgSizeW_});
+    TensorShape outputShape({batchSize, blockNum, blockSize});
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*getInputValue(0), inputShape);
+    outputs.addArg(*getOutputValue(), outputShape, ASSIGN_TO);
+    forward_[0]->calc(inputs, outputs);
+  }
 }
 
 void BlockExpandLayer::backward(const UpdateCallback& callback) {
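
For reviewers: the comment added in ImageExpandOp.cpp describes the column-buffer layout `[output_height, output_width, input_channels, filter_height, filter_width]`. Below is a minimal standalone sketch (not the PR's actual `Im2ColFunctor`; the function name and signature are illustrative) showing why that ordering lets the result be viewed directly as a `[seqLength, stepSize]` matrix — each `(oh, ow)` position writes one contiguous row.

```cpp
// Illustrative im2col in "OCF" order: output positions outermost, so the
// buffer reinterprets as [seqLength, stepSize] with
// seqLength = outputH * outputW and stepSize = channels * blockH * blockW.
void im2colOCF(const float* in, float* out,
               int channels, int inputH, int inputW,
               int blockH, int blockW,
               int strideH, int strideW,
               int paddingH, int paddingW,
               int outputH, int outputW) {
  for (int oh = 0; oh < outputH; ++oh) {
    for (int ow = 0; ow < outputW; ++ow) {
      for (int c = 0; c < channels; ++c) {
        for (int bh = 0; bh < blockH; ++bh) {
          for (int bw = 0; bw < blockW; ++bw) {
            int ih = oh * strideH + bh - paddingH;  // input row
            int iw = ow * strideW + bw - paddingW;  // input column
            // Row (oh * outputW + ow) of the [seqLength, stepSize] view,
            // column ((c * blockH + bh) * blockW + bw), laid out contiguously.
            int outIdx =
                (((oh * outputW + ow) * channels + c) * blockH + bh) * blockW +
                bw;
            bool inBounds = ih >= 0 && ih < inputH && iw >= 0 && iw < inputW;
            // Padding regions are written as zero, matching zero-padding.
            out[outIdx] =
                inBounds ? in[(c * inputH + ih) * inputW + iw] : 0.0f;
          }
        }
      }
    }
  }
}
```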
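
The `CHECK_EQ` calls rely on the output-size expression visible in the first hunk, a ceil-division written with integer arithmetic. A small worked example with hypothetical numbers (not taken from the PR's tests):

```cpp
#include <cstdio>

int main() {
  // Hypothetical configuration, chosen only to exercise the formula.
  int inputH = 5, inputW = 5;
  int blockH = 3, blockW = 3;
  int strideH = 2, strideW = 2;
  int paddingH = 1, paddingW = 1;
  int channels = 4;

  // Same expression as in ImageExpandOp.cpp.
  int outputH = 1 + (inputH + 2 * paddingH - blockH + strideH - 1) / strideH;
  int outputW = 1 + (inputW + 2 * paddingW - blockW + strideW - 1) / strideW;
  int seqLength = outputH * outputW;          // rows of the im2col result
  int stepSize = channels * blockH * blockW;  // columns of the im2col result

  // Here: outputH = outputW = 1 + (5 + 2 - 3 + 2 - 1) / 2 = 3,
  // so seqLength = 9 and stepSize = 4 * 3 * 3 = 36.
  printf("output: %d x %d, seqLength = %d, stepSize = %d\n",
         outputH, outputW, seqLength, stepSize);
  return 0;
}
```

Note that the second `CHECK_EQ` fix in this diff (`blockH() * blockH()` to `blockH() * blockW()`) is exactly the `stepSize` column count above; the old expression only passed for square blocks.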