diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index 6d76e229bfc21f7e0c3c681c30ec189b70144be0..48e2e32f9256fb49c67ba25e9b5a47d72499758b 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include "TensorShape.h"
+#include "TensorType.h"
+
 namespace paddle {
 
 /* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1dac2585db7213592ba1286f1f894343502f122a
--- /dev/null
+++ b/paddle/function/Im2ColOpGpu.cu
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+
+namespace paddle {
+
+template<class T>
+__global__
+void im2colOCF(const T* imData, T* colData,
+               int inputChannels,
+               int inputHeight, int inputWidth,
+               int filterHeight, int filterWidth,
+               int strideHeight, int strideWidth,
+               int paddingHeight, int paddingWidth,
+               int outputHeight, int outputWidth) {
+  int idx = threadIdx.x;
+  int idy = threadIdx.y;
+  int swId = blockIdx.x;
+  int shId = blockIdx.y;
+
+  for (int channelId = threadIdx.z;
+       channelId < inputChannels;
+       channelId += blockDim.z) {
+    int widthOffset = idx + swId * strideWidth - paddingWidth;
+    int heightOffset = idy + shId * strideHeight - paddingHeight;
+    int imOffset = widthOffset + heightOffset * inputWidth
+       + channelId * inputHeight * inputWidth;
+
+    int colOffset = idx + idy * filterWidth
+      + channelId * filterHeight * filterWidth
+      + (shId * outputWidth + swId)
+      * (inputChannels * filterHeight * filterWidth);
+
+    if (idx < filterWidth && idy < filterHeight) {
+      if (heightOffset >= inputHeight || heightOffset < 0 ||
+          widthOffset >= inputWidth || widthOffset < 0) {
+        colData[colOffset] = T(0);
+      } else {
+        colData[colOffset] = imData[imOffset];
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+
+    int blockDimX = 0;
+    int blockDimY = 0;
+    if (filterHeight <= 4 && filterWidth <= 4) {
+      blockDimX = 4;
+      blockDimY = 4;
+    } else if (filterHeight <= 8 && filterWidth <= 8) {
+      blockDimX = 8;
+      blockDimY = 8;
+    } else if (filterHeight <= 16 && filterWidth <= 16) {
+      blockDimX = 16;
+      blockDimY = 16;
+    } else {
+      blockDimX = 32;
+      blockDimY = 32;
+    }
+
+    int blockDimZ = 1024 / blockDimX / blockDimY;
+    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
+    dim3 grid(outputWidth, outputHeight);
+    im2colOCF<T><<< grid, threads, 0, STREAM_DEFAULT >>>
+        (imData, colData, inputChannels, inputHeight, inputWidth,
+         filterHeight, filterWidth, strideHeight, strideWidth,
+         paddingHeight, paddingWidth, outputHeight, outputWidth);
+    CHECK_SYNC("Im2ColFunctor GPU failed");
+  }
+};
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+  }
+};
+
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, float>;
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, double>;
+
+}  // namespace paddle
diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp
index ad34967bd65808361a38d1e5b0cc0042ea2df8c8..fe4c8fefcf5e84763afb8500c749601a19aa1266 100644
--- a/paddle/function/ImageExpandOp.cpp
+++ b/paddle/function/ImageExpandOp.cpp
@@ -291,5 +291,8 @@ public:
 
 REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandForward);
 REGISTER_TYPED_FUNC(ImageExpandGrad, CPU, ImageExpandBackward);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(ImageExpand, GPU, ImageExpandForward);
+#endif
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp
index c8d0b21c8754d6d9e0643433e73eb2c4e4026267..1889b347c2d89c54999d016b70c35b1559900e51 100644
--- a/paddle/gserver/layers/BlockExpandLayer.cpp
+++ b/paddle/gserver/layers/BlockExpandLayer.cpp
@@ -37,16 +37,16 @@ bool BlockExpandLayer::init(const LayerMap& layerMap,
   imgSizeH_ = blockConf.img_size_y();
   imgSizeW_ = blockConf.img_size_x();
 
+  std::vector<size_t> strides = {(size_t)strideH_, (size_t)strideW_};
+  std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
+  std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
+  createFunction(forward_,
+                 "ImageExpand",
+                 FuncConfig()
+                     .set("strides", strides)
+                     .set("paddings", paddings)
+                     .set("blocks", blocks));
   if (!useGpu_) {
-    std::vector<size_t> strides = {(size_t)strideH_, (size_t)strideW_};
-    std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
-    std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
-    createFunction(forward_,
-                   "ImageExpand",
-                   FuncConfig()
-                       .set("strides", strides)
-                       .set("paddings", paddings)
-                       .set("blocks", blocks));
     createFunction(backward_,
                    "ImageExpandGrad",
                    FuncConfig()
@@ -84,62 +84,29 @@ void BlockExpandLayer::forward(PassType passType) {
   size_t blockNum = getBlockNum();
   size_t blockSize = blockH_ * blockW_ * channels_;
   resetOutput(blockNum * batchSize, blockSize);
-  // TODO(hedaoyuan): After completing the GPU version of ImageExpand,
-  // refactor the following code.
-  Argument& out = getOutput();
-  MatrixPtr outV = getOutputValue();
 
-  MatrixPtr input = getPrev(0)->getOutputValue();
-  Matrix::resizeOrCreate(outVTrans_, blockSize, blockNum, false, useGpu_);
+  // calculate output_.value
+  inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
+  outputShape_ = TensorShape({batchSize, blockNum, blockSize});
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inputShape_);
+  outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
+
+  // calculate output_.sequenceStartPositions and output_.cpuSequenceDims
+  Argument& out = getOutput();
   ICpuGpuVector::resizeOrCreate(
       out.sequenceStartPositions, batchSize + 1, false);
   IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false);
   int* start = out.sequenceStartPositions->getMutableData(false);
   int* dims = out.cpuSequenceDims->getData();
   for (size_t i = 0; i < batchSize; i++) {
-    if (useGpu_) {
-      outVTrans_->zeroMem();
-      /* expand each block as one row */
-      MatrixPtr inputTmp =
-          Matrix::create(input->getData() + i * input->getWidth(),
-                         1,
-                         input->getWidth(),
-                         false,
-                         useGpu_);
-      outVTrans_->convExpand(*inputTmp,
-                             imgSizeH_,
-                             imgSizeW_,
-                             channels_,
-                             blockH_,
-                             blockW_,
-                             strideH_,
-                             strideW_,
-                             paddingH_,
-                             paddingW_,
-                             outputH_,
-                             outputW_);
-      MatrixPtr outVTmp =
-          Matrix::create(outV->getData() + i * blockNum * blockSize,
-                         blockNum,
-                         blockSize,
-                         false,
-                         useGpu_);
-      outVTrans_->transpose(outVTmp, false);
-    }
     start[i] = i * blockNum;
     dims[2 * i] = outputH_;
     dims[2 * i + 1] = outputW_;
   }
   start[batchSize] = batchSize * blockNum;
-  if (!useGpu_) {
-    inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
-    outputShape_ = TensorShape({batchSize, blockNum, blockSize});
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*getInputValue(0), inputShape_);
-    outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
-    forward_[0]->calc(inputs, outputs);
-  }
 }
 
 void BlockExpandLayer::backward(const UpdateCallback& callback) {
diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
index edda0e0b630ae8e1d805a39f3e5d4cb50649570b..15ce73ab8b2ca16ba1e9329ed5c00dc7239e8b93 100644
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ b/paddle/gserver/layers/BlockExpandLayer.h
@@ -50,9 +50,6 @@ protected:
   size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_;
   size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_;
 
-  /// auxiliary variable, which saves the transposed output value.
-  MatrixPtr outVTrans_;
-
   TensorShape inputShape_;
   TensorShape outputShape_;