diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h
index 25d2e220bfb938597cb9c5d686b7f65851348d40..f724643f35af909b9557fe2282f23d4c062c8e8e 100644
--- a/paddle/function/GemmConvOp.h
+++ b/paddle/function/GemmConvOp.h
@@ -44,6 +44,7 @@ enum ColFormat { kCFO = 0, kOCF = 1 };
  *            input_channels,
  *            filter_height,
  *            filter_width]
+ * TODO(hedaoyuan): Refactor the arguments of the interface with TensorShape.
  */
 template <ColFormat Format, DeviceType Device, class T>
 class Im2ColFunctor {
diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp
index 0c10f30bbd9c129a949b660fb1a2e8122ea18597..4d8c25ffcdafa3dac0d239fa39b28d9714ebf611 100644
--- a/paddle/function/ImageExpandOp.cpp
+++ b/paddle/function/ImageExpandOp.cpp
@@ -70,16 +70,67 @@ public:
   }
 };
 
+template <class T>
+class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* colData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* imData) {
+    for (int outputH = 0; outputH < outputHeight; ++outputH) {
+      for (int outputW = 0; outputW < outputWidth; ++outputW) {
+        for (int channel = 0; channel < inputChannels; ++channel) {
+          for (int filterH = 0; filterH < filterHeight; ++filterH) {
+            for (int filterW = 0; filterW < filterWidth; ++filterW) {
+              int imRowOffset =
+                  outputH * strideHeight + filterH - paddingHeight;
+              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int colDataOffset =
+                  (((outputH * outputWidth + outputW) * inputChannels +
+                    channel) *
+                       filterHeight +
+                   filterH) *
+                      filterWidth +
+                  filterW;
+              if (imRowOffset >= 0 && imRowOffset < inputHeight &&
+                  imColOffset >= 0 && imColOffset < inputWidth) {
+                int imDataOffset =
+                    (channel * inputHeight + imRowOffset) * inputWidth +
+                    imColOffset;
+                imData[imDataOffset] += colData[colDataOffset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
 /*
  * \brief Converts the image data of four dimensions(NCHW) into
- *        a sequence data of three dimensions(NST). Where N is batch size,
- *        S is the length of the sequence after each image is expanded,
- *        T is the size of each time step in the sequence.
+ *        a sequence data of three dimensions(NST) in the forward calculation,
+ *        which is reversed in the backward calculation.
+ *        Where N is batch size, S is the length of the sequence after each
+ *        image is expanded, T is the size of each time step in the sequence.
  *
+ * Arguments in forward function:
  * \param inputs[0] Image data of NCHW format.
  * \param outputs[0] Sequence data of NST format.
+ *
+ * Arguments in backward function:
+ * \param inputs[0] Sequence data of NST format.
+ * \param outputs[0] Image data of NCHW format.
  */
-template <DeviceType Device>
 class ImageExpandFunction : public FunctionBase {
 public:
   void init(const FuncConfig& config) override {
@@ -93,25 +144,27 @@ public:
     numOutputs_ = 1;
   }
 
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& output = outputs[0].shape();
-    // input argument should be 4-dimensional.
-    CHECK_EQ(input.ndims(), (size_t)4);
-    // output argument should be 3-dimensional.
-    CHECK_EQ(output.ndims(), (size_t)3);
-    // The batchSize of the input needs to be equal to
-    // the batchSize of the output.
-    CHECK_EQ(input[0], output[0]);
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t seqLength = output[1];
-    size_t stepSize = output[2];
+  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
+
+  void check(const TensorShape& image, const TensorShape& sequence) {
+    // image shape should be 4-dimensional.
+    CHECK_EQ(image.ndims(), (size_t)4);
+    // sequence shape should be 3-dimensional.
+    CHECK_EQ(sequence.ndims(), (size_t)3);
+    // The batchSize of the image needs to be equal to
+    // the batchSize of the sequence.
+    CHECK_EQ(image[0], sequence[0]);
+  }
+
+  // Calculate the shape of colData based on the shape of the image
+  // and the shape of the sequence.
+  TensorShape getColShape(const TensorShape& image,
+                          const TensorShape& sequence) {
+    size_t inputChannels = image[1];
+    size_t inputHeight = image[2];
+    size_t inputWidth = image[3];
+    size_t seqLength = sequence[1];
+    size_t stepSize = sequence[2];
     size_t outputHeight =
         1 +
         (inputHeight + 2 * paddingH() - blockH() + strideH() - 1) / strideH();
@@ -121,8 +174,59 @@ public:
     CHECK_EQ(seqLength, outputHeight * outputWidth);
     CHECK_EQ(stepSize, inputChannels * blockH() * blockW());
 
-    real* inputData = inputs[0].data<real>();
-    real* outputData = outputs[0].data<real>();
+    // [output_height, output_width,
+    //  input_channels, filter_height, filter_width]
+    return TensorShape({outputHeight,
+                        outputWidth,
+                        inputChannels,
+                        (size_t)blockH(),
+                        (size_t)blockW()});
+  }
+
+protected:
+  std::vector<size_t> strides_;
+  std::vector<size_t> paddings_;
+  std::vector<size_t> blocks_;
+
+  inline int strideH() const { return strides_[0]; }
+
+  inline int strideW() const { return strides_[1]; }
+
+  inline int paddingH() const { return paddings_[0]; }
+
+  inline int paddingW() const { return paddings_[1]; }
+
+  inline int blockH() const { return blocks_[0]; }
+
+  inline int blockW() const { return blocks_[1]; }
+};
+
+template <DeviceType Device>
+class ImageExpandForward : public ImageExpandFunction {
+public:
+  void init(const FuncConfig& config) override {
+    ImageExpandFunction::init(config);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    const TensorShape& image = inputs[0].shape();
+    const TensorShape& sequence = outputs[0].shape();
+    check(image, sequence);
+
+    TensorShape colShape = getColShape(image, sequence);
+    size_t batchSize = image[0];
+    size_t inputChannels = image[1];
+    size_t inputHeight = image[2];
+    size_t inputWidth = image[3];
+    size_t seqLength = sequence[1];
+    size_t stepSize = sequence[2];
+    size_t outputHeight = colShape[0];
+    size_t outputWidth = colShape[1];
+
+    real* imageData = inputs[0].data<real>();
+    real* seqData = outputs[0].data<real>();
     Im2ColFunctor<kOCF, Device, real> im2col;
     for (size_t i = 0; i < batchSize; i++) {
       // The result of im2col is [output_height, output_width,
@@ -130,7 +234,7 @@ public:
       // reshape into [seqLength, stepSize], where seqLength is equal
       // output_height * output_width, stepSize is equal
      // input_channels * filter_height * filter_width
-      im2col(inputData,
+      im2col(imageData,
             inputChannels,
             inputHeight,
             inputWidth,
@@ -142,30 +246,64 @@ public:
             paddingW(),
             outputHeight,
             outputWidth,
-             outputData);
-      inputData += inputChannels * inputHeight * inputWidth;
-      outputData += seqLength * stepSize;
+             seqData);
+      imageData += inputChannels * inputHeight * inputWidth;
+      seqData += seqLength * stepSize;
     }
   }
+};
 
-protected:
-  std::vector<size_t> strides_;
-  std::vector<size_t> paddings_;
-  std::vector<size_t> blocks_;
-
-  inline int strideH() const { return strides_[0]; }
-
-  inline int strideW() const { return strides_[1]; }
-
-  inline int paddingH() const { return paddings_[0]; }
+template <DeviceType Device>
+class ImageExpandBackward : public ImageExpandFunction {
+public:
+  void init(const FuncConfig& config) override {
+    ImageExpandFunction::init(config);
+  }
 
-  inline int paddingW() const { return paddings_[1]; }
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    // Since the implementation of Col2ImFunctor is ADD_TO,
+    // this function only supports ADD_TO mode.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    const TensorShape& image = outputs[0].shape();
+    const TensorShape& sequence = inputs[0].shape();
+    check(image, sequence);
 
-  inline int blockH() const { return blocks_[0]; }
+    TensorShape colShape = getColShape(image, sequence);
+    size_t batchSize = image[0];
+    size_t inputChannels = image[1];
+    size_t inputHeight = image[2];
+    size_t inputWidth = image[3];
+    size_t seqLength = sequence[1];
+    size_t stepSize = sequence[2];
+    size_t outputHeight = colShape[0];
+    size_t outputWidth = colShape[1];
 
-  inline int blockW() const { return blocks_[1]; }
+    real* imageData = outputs[0].data<real>();
+    real* seqData = inputs[0].data<real>();
+    Col2ImFunctor<kOCF, Device, real> col2im;
+    for (size_t i = 0; i < batchSize; i++) {
+      col2im(seqData,
+             inputChannels,
+             inputHeight,
+             inputWidth,
+             blockH(),
+             blockW(),
+             strideH(),
+             strideW(),
+             paddingH(),
+             paddingW(),
+             outputHeight,
+             outputWidth,
+             imageData);
+      imageData += inputChannels * inputHeight * inputWidth;
+      seqData += seqLength * stepSize;
+    }
+  }
 };
 
-REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandFunction);
+REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandForward);
+REGISTER_TYPED_FUNC(ImageExpandGrad, CPU, ImageExpandBackward);
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp
index 9760d39bb4a3c8f94a2ffbd1fbd2ff7438298b9b..c8d0b21c8754d6d9e0643433e73eb2c4e4026267 100644
--- a/paddle/gserver/layers/BlockExpandLayer.cpp
+++ b/paddle/gserver/layers/BlockExpandLayer.cpp
@@ -47,6 +47,12 @@ bool BlockExpandLayer::init(const LayerMap& layerMap,
                        .set("strides", strides)
                        .set("paddings", paddings)
                        .set("blocks", blocks));
+    createFunction(backward_,
+                   "ImageExpandGrad",
+                   FuncConfig()
+                       .set("strides", strides)
+                       .set("paddings", paddings)
+                       .set("blocks", blocks));
   }
 
   return true;
@@ -126,12 +132,12 @@ void BlockExpandLayer::forward(PassType passType) {
   }
   start[batchSize] = batchSize * blockNum;
   if (!useGpu_) {
-    TensorShape inputShape({batchSize, channels_, imgSizeH_, imgSizeW_});
-    TensorShape outputShape({batchSize, blockNum, blockSize});
+    inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
+    outputShape_ = TensorShape({batchSize, blockNum, blockSize});
     BufferArgs inputs;
     BufferArgs outputs;
-    inputs.addArg(*getInputValue(0), inputShape);
-    outputs.addArg(*getOutputValue(), outputShape, ASSIGN_TO);
+    inputs.addArg(*getInputValue(0), inputShape_);
+    outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
     forward_[0]->calc(inputs, outputs);
   }
 }
@@ -144,41 +150,50 @@ void BlockExpandLayer::backward(const UpdateCallback& callback) {
   if (!preGrad) {
     return;
   }
-  MatrixPtr grad = getOutputGrad();
-  MatrixPtr gradTrans = Matrix::create(blockSize, blockNum, false, useGpu_);
-  size_t batchSize = preGrad->getHeight();
-  CHECK_EQ(batchSize * blockNum, grad->getHeight());
-  CHECK_EQ(blockSize, grad->getWidth());
+  if (useGpu_) {
+    MatrixPtr grad = getOutputGrad();
+    MatrixPtr gradTrans = Matrix::create(blockSize, blockNum, false, useGpu_);
+    size_t batchSize = preGrad->getHeight();
 
-  for (size_t i = 0; i < batchSize; i++) {
-    MatrixPtr gradTmp =
-        Matrix::create(grad->getData() + i * blockNum * blockSize,
-                       blockNum,
-                       blockSize,
-                       false,
-                       useGpu_);
-    gradTmp->transpose(gradTrans, false);
-    MatrixPtr preGradTmp =
-        Matrix::create(preGrad->getData() + i * preGrad->getWidth(),
-                       1,
-                       preGrad->getWidth(),
-                       false,
-                       useGpu_);
-    preGradTmp->convShrink(*gradTrans,
-                           imgSizeH_,
-                           imgSizeW_,
-                           channels_,
-                           blockH_,
-                           blockW_,
-                           strideH_,
-                           strideW_,
-                           paddingH_,
-                           paddingW_,
-                           outputH_,
-                           outputW_,
-                           1.0,
-                           1.0);
+    CHECK_EQ(batchSize * blockNum, grad->getHeight());
+    CHECK_EQ(blockSize, grad->getWidth());
+
+    for (size_t i = 0; i < batchSize; i++) {
+      MatrixPtr gradTmp =
+          Matrix::create(grad->getData() + i * blockNum * blockSize,
+                         blockNum,
+                         blockSize,
+                         false,
+                         useGpu_);
+      gradTmp->transpose(gradTrans, false);
+      MatrixPtr preGradTmp =
+          Matrix::create(preGrad->getData() + i * preGrad->getWidth(),
+                         1,
+                         preGrad->getWidth(),
+                         false,
+                         useGpu_);
+      preGradTmp->convShrink(*gradTrans,
+                             imgSizeH_,
+                             imgSizeW_,
+                             channels_,
+                             blockH_,
+                             blockW_,
+                             strideH_,
+                             strideW_,
+                             paddingH_,
+                             paddingW_,
+                             outputH_,
+                             outputW_,
+                             1.0,
+                             1.0);
+    }
+  } else {
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*getOutputGrad(), outputShape_);
+    outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO);
+    backward_[0]->calc(inputs, outputs);
   }
 }
 
diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
index 8f347400e60ec84fc1b5fdbc1c911a8768b306d0..edda0e0b630ae8e1d805a39f3e5d4cb50649570b 100644
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ b/paddle/gserver/layers/BlockExpandLayer.h
@@ -53,6 +53,9 @@ protected:
   /// auxiliary variable, which saves the transposed output value.
   MatrixPtr outVTrans_;
 
+  TensorShape inputShape_;
+  TensorShape outputShape_;
+
 public:
   explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {}
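
Illustration (not part of the patch): ImageExpandBackward only accepts ADD_TO because Col2ImFunctor accumulates with +=. When the stride is smaller than the block size, neighboring blocks overlap, so several column elements scatter back onto the same image element. The standalone sketch below mirrors the two kOCF loop nests from the patch with plain arrays; the helper names im2colRef/col2imRef and the main() driver are invented for illustration and are not PaddlePaddle API.

#include <cstdio>
#include <vector>

// Mirrors Im2ColFunctor<kOCF, ...>: expand a [channels, height, width] image
// into [outH, outW, channels, filterH, filterW] column data.
void im2colRef(const float* im, float* col, int channels, int height,
               int width, int filterH, int filterW, int stride, int pad,
               int outH, int outW) {
  for (int oh = 0; oh < outH; ++oh) {
    for (int ow = 0; ow < outW; ++ow) {
      for (int c = 0; c < channels; ++c) {
        for (int fh = 0; fh < filterH; ++fh) {
          for (int fw = 0; fw < filterW; ++fw) {
            int row = oh * stride + fh - pad;
            int colPos = ow * stride + fw - pad;
            int dst =
                (((oh * outW + ow) * channels + c) * filterH + fh) * filterW +
                fw;
            // Elements that fall into the padding area read as zero.
            col[dst] =
                (row >= 0 && row < height && colPos >= 0 && colPos < width)
                    ? im[(c * height + row) * width + colPos]
                    : 0.0f;
          }
        }
      }
    }
  }
}

// Mirrors the new Col2ImFunctor: scatter column data back into the image
// buffer, accumulating (the ADD_TO semantics ImageExpandBackward checks for).
void col2imRef(const float* col, float* im, int channels, int height,
               int width, int filterH, int filterW, int stride, int pad,
               int outH, int outW) {
  for (int oh = 0; oh < outH; ++oh) {
    for (int ow = 0; ow < outW; ++ow) {
      for (int c = 0; c < channels; ++c) {
        for (int fh = 0; fh < filterH; ++fh) {
          for (int fw = 0; fw < filterW; ++fw) {
            int row = oh * stride + fh - pad;
            int colPos = ow * stride + fw - pad;
            if (row >= 0 && row < height && colPos >= 0 && colPos < width) {
              int src = (((oh * outW + ow) * channels + c) * filterH + fh) *
                            filterW +
                        fw;
              im[(c * height + row) * width + colPos] += col[src];
            }
          }
        }
      }
    }
  }
}

int main() {
  // One 1-channel 4x4 image, 2x2 blocks, stride 2, no padding:
  // outH = outW = 1 + (4 + 0 - 2 + 2 - 1) / 2 = 2, so blocks do not
  // overlap and col2im(im2col(x)) copies every element back exactly once.
  const int C = 1, H = 4, W = 4, F = 2, S = 2, P = 0, OH = 2, OW = 2;
  std::vector<float> im(C * H * W, 1.0f);
  std::vector<float> col(OH * OW * C * F * F, 0.0f);
  std::vector<float> grad(C * H * W, 0.0f);  // ADD_TO: caller zero-fills
  im2colRef(im.data(), col.data(), C, H, W, F, F, S, P, OH, OW);
  col2imRef(col.data(), grad.data(), C, H, W, F, F, S, P, OH, OW);
  printf("grad[0] = %.1f\n", grad[0]);  // prints 1.0
  return 0;
}

With stride 1 and 2x2 blocks, interior image elements would instead receive four contributions each, which is why BlockExpandLayer::backward registers the input gradient with ADD_TO while the forward output can safely use ASSIGN_TO.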