refactored ExpandConvLayer and ExpandConvTransLayer with ConvBaseLayerCpu

2575b74f · wangyang59 · aa2cd2ce · 2575b74f · 2575b74f · 2575b74f
10 changed file
--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ b/paddle/gserver/layers/ConvBaseLayer.h
@@ -78,12 +78,7 @@ protected:
  /// of output size.
  bool caffeMode_;
-  /*The expandInput_ and transOutValue_ are used for CPU expand conv calc*/
-  /// Expand one sample at a time. shape:
-  /// (numChannels * filterPixels_, outputSizeH * outputSizeW)
-  MatrixPtr expandInput_;
-  /// The transpose of output, which is an auxiliary matrix.
-  MatrixPtr transOutValue_;
 public:
  explicit ConvBaseLayer(const LayerConfig& config) : Layer(config) {}
@@ -135,26 +130,6 @@ public:
    CHECK_GE(imageSize, 1);
    return imageSize;
  }
-  /**
-   * Create or resize expandInput_.
-   */
-  void resetExpandInput(size_t height, size_t width);
-  /**
-   * Create or resize transOutValue_.
-   */
-  void resetConvOutput(size_t batchSize, int inIdx);
-  /**
-   * Add shared bias.
-   */
-  void addSharedBias();
-  /**
-   * Add unshared bias.
-   */
-  void addUnsharedBias();
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/ConvBaseLayerCpu.cpp
+++ b/paddle/gserver/layers/ConvBaseLayerCpu.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/utils/Logging.h"
+#include "ConvBaseLayerCpu.h"
+namespace paddle {
+bool ConvBaseLayerCpu::init(const LayerMap &layerMap,
+                           const ParameterMap &parameterMap) {
+  /* Initialize the basic convolutional parent class */
+  ConvBaseLayer::init(layerMap, parameterMap);
+  int channel;
+  /* Initialize the projection */
+  for (auto &inputConfig : config_.inputs()) {
+    const ConvConfig &conf = inputConfig.conv_conf();
+    subM_.push_back(numFilters_ / conf.groups());
+    subN_.push_back(conf.output_x() * conf.output_x());
+    channel = isConv_ ? conf.channels() : numFilters_;
+    subK_.push_back(channel * conf.filter_size() * conf.filter_size() /
+                    conf.groups());
+    /* Consistent caffe mode for multiple input */
+    caffeMode_ = conf.caffe_mode();
+  }
+  return true;
+}
+void ConvBaseLayerCpu::resetExpandInput(size_t height, size_t width) {
+  Matrix::resizeOrCreate(expandInput_, height, width, false, useGpu_);
+}
+void ConvBaseLayerCpu::addSharedBias() {
+  size_t mapW = getSize() / numFilters_;
+  size_t mapH = getOutputValue()->getElementCnt() / mapW;
+  MatrixPtr out =
+      Matrix::create(getOutputValue()->getData(), mapH, mapW, false, useGpu_);
+  Matrix::resizeOrCreate(transOutValue_, mapW, mapH, false, useGpu_);
+  out->transpose(transOutValue_, false);  // false means no memory allocation
+  transOutValue_->reshape(transOutValue_->getElementCnt() / numFilters_,
+                          numFilters_);
+  MatrixPtr bias =
+      Matrix::create(biases_->getW()->getData(), 1,
+                     biases_->getW()->getElementCnt(), false, useGpu_);
+  transOutValue_->addBias(*bias, 1.0f);
+  transOutValue_->reshape(mapW, mapH);
+  transOutValue_->transpose(out, false);  // false means no memory allocation
+  out->clear();
+  bias->clear();
+}
+void ConvBaseLayerCpu::addUnsharedBias() {
+  MatrixPtr outValue = getOutputValue();
+  MatrixPtr bias =
+      Matrix::create(biases_->getW()->getData(), 1,
+                     biases_->getW()->getElementCnt(), false, useGpu_);
+  outValue->addBias(*bias, 1.0f);
+}
+void ConvBaseLayerCpu::expandOneFrame(MatrixPtr image, size_t startIdx,
+                                     int inIdx) {
+  int channel = isConv_ ? channels_[inIdx] : numFilters_;
+  resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]);
+  real *imgData = image->getData() + startIdx * image->getWidth();
+  MatrixPtr imageTmp = Matrix::create(
+      imgData, 1, imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel, false,
+      useGpu_);
+  expandInput_->convExpand(*imageTmp, imgSizeH_[inIdx], imgSizeW_[inIdx],
+                           channel, filterSize_[inIdx],
+                           filterSize_[inIdx], stride_[inIdx], stride_[inIdx],
+                           padding_[inIdx], padding_[inIdx],
+                           outputH_[inIdx], outputW_[inIdx]);
+  imageTmp->clear();
+}
+void ConvBaseLayerCpu::expandFwdOnce(MatrixPtr image, MatrixPtr out,
+                                     int inIdx, int startIdx) {
+  int subM = subM_[inIdx];
+  int subN = subN_[inIdx];
+  int subK = subK_[inIdx];
+  expandOneFrame(image, startIdx, inIdx);
+  int nf = isConv_ ? numFilters_ : channels_[inIdx];
+  real *outData =
+      out->getData() + startIdx * subN * nf;
+  real *wgtData = weights_[inIdx]->getW()->getData();
+  real *expInData = expandInput_->getData();
+  for (int g = 0; g < groups_[inIdx]; ++g) {
+    MatrixPtr A =
+        Matrix::create(wgtData, subK, subM, true, useGpu_);  // mark transpose
+    MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_);
+    MatrixPtr C = Matrix::create(outData, subM, subN, false, useGpu_);
+    C->mul(A, B, 1, 1);
+    A->clear();
+    B->clear();
+    C->clear();
+    wgtData += subK * subM;
+    expInData += subK * subN;
+    outData += subM * subN;
+  }
+}
+void ConvBaseLayerCpu::bpropActs(MatrixPtr image, MatrixPtr out, int inpIdx) {
+  int channel = isConv_ ? channels_[inpIdx] : numFilters_;
+  int subM = subM_[inpIdx];
+  int subN = subN_[inpIdx];
+  int subK = subK_[inpIdx];
+  size_t batchSize = image->getHeight();
+  MatrixPtr tgtGrad = out;
+  /* reset the expand-grad memory */
+  resetExpandInput(subK * groups_[inpIdx], subN);
+  real *localGradData = image->getData();
+  real *tgtGradData = tgtGrad->getData();
+  for (size_t n = 0; n < batchSize; n++) {
+    real *wgtData = weights_[inpIdx]->getW()->getData();
+    real *expandInData = expandInput_->getData();
+    for (int g = 0; g < groups_[inpIdx]; g++) {
+      // create temporary matrix
+      MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_);
+      MatrixPtr B = Matrix::create(localGradData, subM, subN, false, useGpu_);
+      MatrixPtr A = Matrix::create(wgtData, subK, subM, false, useGpu_);
+      C->mul(A, B);  // mul
+      // clear the temporary matrix
+      A->clear();
+      B->clear();
+      C->clear();
+      expandInData += subK * subN;
+      localGradData += subM * subN;
+      wgtData += subK * subM;
+    }
+    // shrink one frame outGrad
+    MatrixPtr oneGradTmp = Matrix::create(
+        expandInput_->getData(), subK * groups_[inpIdx], subN, false, useGpu_);
+    MatrixPtr vTmp = Matrix::create(
+        tgtGradData, 1,
+        imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel, false,
+        useGpu_);
+    vTmp->convShrink(*oneGradTmp, imgSizeH_[inpIdx], imgSizeW_[inpIdx],
+                     channel, filterSize_[inpIdx],
+                     filterSize_[inpIdx], stride_[inpIdx], stride_[inpIdx],
+                     padding_[inpIdx], padding_[inpIdx],
+                     outputH_[inpIdx], outputW_[inpIdx], 1.0f, 1.0f);
+    vTmp->clear();
+    oneGradTmp->clear();
+    // move the data-pointer
+    tgtGradData += imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel;
+  }
+}
+void ConvBaseLayerCpu::bpropWeights(MatrixPtr image, MatrixPtr out,
+                                    int inpIdx) {
+  MatrixPtr weightGrad = weights_[inpIdx]->getWGrad();
+  int subM = subM_[inpIdx];
+  int subN = subN_[inpIdx];
+  int subK = subK_[inpIdx];
+  size_t batchSize = image->getHeight();
+  resetExpandInput(subK * groups_[inpIdx], subN);
+  real *gradData = out->getData();
+  for (size_t n = 0; n < batchSize; n++) {  // frame by frame
+    // expand
+    expandOneFrame(image, n, inpIdx);
+    real *wGradData = weightGrad->getData();
+    real *expandInData = expandInput_->getData();
+    // expand-mul one-group by one
+    for (int g = 0; g < groups_[inpIdx]; g++) {
+      MatrixPtr A = Matrix::create(expandInData, subK, subN, false, useGpu_);
+      MatrixPtr B = Matrix::create(gradData, subM, subN, true, useGpu_);
+      MatrixPtr C = Matrix::create(wGradData, subK, subM, false, useGpu_);
+      C->mul(A, B, 1, 1);
+      A->clear();
+      B->clear();
+      C->clear();
+      gradData += subM * subN;
+      wGradData += subK * subM;
+      expandInData += subK * subN;
+    }
+  }
+}
+void ConvBaseLayerCpu::bpropSharedBias(MatrixPtr biases, MatrixPtr v) {
+  size_t mapW = getSize() / numFilters_;
+  size_t mapH = v->getElementCnt() / mapW;
+  MatrixPtr vTmp = Matrix::create(v->getData(), mapH, mapW, false, useGpu_);
+  Matrix::resizeOrCreate(transOutValue_, mapW, mapH, false, useGpu_);
+  vTmp->transpose(transOutValue_, false);  // false means no memory allocation
+  transOutValue_->reshape(transOutValue_->getElementCnt() / numFilters_,
+                          numFilters_);
+  biases->collectBias(*transOutValue_, 1.0f);
+}
+void ConvBaseLayerCpu::bpropBiases(MatrixPtr v) {
+  MatrixPtr biases =
+      Matrix::create(biases_->getWGrad()->getData(), 1,
+                     biases_->getWGrad()->getElementCnt(), false, useGpu_);
+  if (sharedBiases_) {
+    bpropSharedBias(biases, v);
+  } else {
+    biases->collectBias(*v, 1.0f);
+  }
+  biases->clear();
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/ConvBaseLayerCpu.h
+++ b/paddle/gserver/layers/ConvBaseLayerCpu.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "ConvBaseLayer.h"
+#include "paddle/math/Matrix.h"
+#include <vector>
+namespace paddle {
+/**
+ * @brief A subclass of ConvBaseLayer that is a superclass of both
+ * ExpandConvLayer and ExpandConvTransLayer
+ */
+class ConvBaseLayerCpu : public ConvBaseLayer {
+protected:
+  /// For expand convolution.
+  /// subM_ = numFilters_ / groups_.
+  IntV subM_;
+  /// subN_ = outputH_ * outputW_.
+  IntV subN_;
+  /// subK_ = channels_ * filterPixels_ * groups_.
+  IntV subK_;
+  /// The spatial dimensions of height of input feature map.
+  IntV imgSizeH_;
+  /// The spatial dimensions of width of input feature map.
+  IntV imgSizeW_;
+  /// The spatial dimensions of height of output feature map.
+  IntV outputH_;
+  /// The spatial dimensions of width of output feature map.
+  IntV outputW_;
+  /*The expandInput_ and transOutValue_ are used for CPU expand conv calc*/
+  /// Expand one sample at a time. shape:
+  /// (numChannels * filterPixels_, outputSizeH * outputSizeW)
+  MatrixPtr expandInput_;
+  /// The transpose of output, which is an auxiliary matrix.
+  MatrixPtr transOutValue_;
+public:
+  explicit ConvBaseLayerCpu(const LayerConfig& config)
+    : ConvBaseLayer(config) {}
+  ~ConvBaseLayerCpu() {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  /**
+   * Create or resize expandInput_.
+   */
+  void resetExpandInput(size_t height, size_t width);
+  /**
+   * Add shared bias.
+   */
+  void addSharedBias();
+  /**
+   * Add unshared bias.
+   */
+  void addUnsharedBias();
+  /**
+   * Expand one input sample.
+   */
+  void expandOneFrame(MatrixPtr image, size_t startIdx, int inIdx);
+  /**
+   * Expand one input sample and perform matrix multiplication.
+   */
+  void expandFwdOnce(MatrixPtr image, MatrixPtr out, int inIdx, int startIdx);
+  void bpropSharedBias(MatrixPtr biases, MatrixPtr v);
+  void bpropBiases(MatrixPtr v);
+  void bpropWeights(MatrixPtr image, MatrixPtr out, int inpIdx);
+  void bpropActs(MatrixPtr image, MatrixPtr out, int inpIdx);
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/ConvTransBaseLayer.cpp
+++ b/paddle/gserver/layers/ConvTransBaseLayer.cpp
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/utils/Logging.h"
-#include "ConvTransBaseLayer.h"
-namespace paddle {
-bool ConvTransBaseLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  /* Initialize the convolutional layer parameter */
-  /* Everything is the same as ConvBaseLayer.cpp except that the meaning of
-   * num_filters and channel is switched.
-   *
-   * In the config, num_filters refer to the number of feature maps in the
-   * output of convTransLayer, and channel refer to the number of feature maps
-   * in the input of convTransLayer.
-   *
-   * However, within the convTrans class, the channel is related to the output
-   * and num_filters is related to the input, so that it is consistent with the
-   * settings in convLayer.
-   * */
-  channel_ = config_.num_filters();
-  sharedBiases_ = config_.shared_biases();
-  for (auto& inputConfig : config_.inputs()) {
-    const ConvConfig& conf = inputConfig.conv_conf();
-    padding_.push_back(conf.padding());
-    stride_.push_back(conf.stride());
-    filterSize_.push_back(conf.filter_size());
-    paddingY_.push_back(conf.padding_y());
-    strideY_.push_back(conf.stride_y());
-    filterSizeY_.push_back(conf.filter_size_y());
-    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
-    numFilters_.push_back(conf.channels());
-    imgSize_.push_back(conf.img_size());
-    imgPixels_.push_back(imgSize_.back() * imgSize_.back());
-    groups_.push_back(conf.groups());
-    filterChannels_.push_back(conf.filter_channels());
-    outputX_.push_back(conf.output_x());
-    outputs_.push_back(outputX_.back() * outputX_.back());
-  }
-  /* initialize the weightList */
-  CHECK(inputLayers_.size() == parameters_.size());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    size_t height, width;
-    height = filterPixels_[i] * filterChannels_[i];
-    width = numFilters_[i];
-    // create a new weight
-    CHECK_EQ(parameters_[i]->getSize(), width * height);
-    Weight* w = new Weight(height, width, parameters_[i]);
-    weights_.emplace_back(w);
-  }
-  /* initialize the biases_ */
-  if (biasParameter_.get() != NULL) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)channel_, biasParameter_->getSize());
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(channel_, 1, biasParameter_));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
-    }
-  }
-  // default caffe model
-  caffeMode_ = true;
-  return true;
-}
-}  // namespace paddle
--- a/paddle/gserver/layers/ConvTransBaseLayer.h
+++ b/paddle/gserver/layers/ConvTransBaseLayer.h
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include "Layer.h"
-namespace paddle {
-/**
- * @brief A Base Convolution Layer, which convolves the input image
- * with learned filters and (optionally) adds biases.
- */
-class ConvTransBaseLayer : public Layer {
-protected:
-  typedef std::vector<int> IntV;
-  /// The number of channel in image (the output of the deconv layer).
-  int channel_;
-  /// The x dimension of the padding.
-  IntV padding_;
-  /// The y dimension of the padding.
-  IntV paddingY_;
-  /// The x dimension of the stride.
-  IntV stride_;
-  /// The y dimension of the stride.
-  IntV strideY_;
-  /// The x dimension of a filter kernel.
-  IntV filterSize_;
-  /// The y dimension of a filter kernel.
-  IntV filterSizeY_;
-  /// The number of filters(i.e. the number channels of the deconv layer input)
-  IntV numFilters_;
-  /// The spatial dimensions of input feature map.
-  IntV imgSize_;
-  /// The total pixel size of input feature map.
-  /// imgPixels_ = imgSizeX_ * imgSizeY_.
-  IntV imgPixels_;
-  /// filterPixels_ = filterSizeX_ * filterSizeY_.
-  IntV filterPixels_;
-  /// filterChannels_ = channels_/groups_.
-  IntV filterChannels_;
-  /// The spatial dimensions of output feature map.
-  IntV outputX_;
-  /// The spatial dimensions of output feature map.
-  IntV outputs_;
-  /// Group size, refer to grouped convolution in
-  /// Alex Krizhevsky's paper: when group=2, the first half of the
-  /// filters are only connected to the first half of the input channels,
-  /// and the second half only connected to the second half.
-  IntV groups_;
-  /// Whether the bias is shared for feature in each channel.
-  bool sharedBiases_;
-  /// shape of weight: (numChannels * filterPixels_, numFilters)
-  WeightList weights_;
-  /// If shared_biases is false shape of bias: (numFilters_, 1)
-  /// If shared_biases is ture shape of bias:
-  /// (numFilters_ * outputX * outputY, 1)
-  std::unique_ptr<Weight> biases_;
-  /// True by default. The only difference is the calculation
-  /// of output size.
-  bool caffeMode_;
-public:
-  explicit ConvTransBaseLayer(const LayerConfig& config) : Layer(config) {}
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  Weight& getWeight(int idx) { return *weights_[idx]; }
-  /**
-   * Calculate image size based on caffeMode_ from outputSize.
-   * - input(+padding): 0123456789
-   * - imageSize(+padding) = 10;
-   * - filterSize = 3;
-   * - stride = 2;
-   * - caffeMode_ is true:
-       - output: (012), (234), (456), (678)
-       - outputSize = 4;
-   * - caffeMode_ is false:
-   *   - output: (012), (234), (456), (678), (9)
-   *   - outputSize = 5;
-   */
-  /*
-   * In order to be consistent with the convLayer, here the outputSize is
-   * actually the size of the input image of convTransLayer, and the image size
-   * is actually the size of the output image of convTransLayer
-   */
-  int imageSize(int outputSize, int filterSize, int padding, int stride) {
-    int imageSize;
-    if (!caffeMode_) {
-     imageSize =
-         (outputSize - 1) * stride + filterSize - 2 * padding - stride + 1;
-    } else {
-     imageSize = (outputSize - 1) * stride + filterSize - 2 * padding;
-    }
-    CHECK_GE(imageSize, 1);
-    return imageSize;
-  }
-};
-}  // namespace paddle
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -24,32 +24,7 @@ REGISTER_LAYER(exconv, ExpandConvLayer);
 bool ExpandConvLayer::init(const LayerMap &layerMap,
                           const ParameterMap &parameterMap) {
  /* Initialize the basic convolutional parent class */
-  ConvBaseLayer::init(layerMap, parameterMap);
+  ConvBaseLayerCpu::init(layerMap, parameterMap);
-  /* Initialize the projection */
-  for (auto &inputConfig : config_.inputs()) {
-    const ConvConfig &conf = inputConfig.conv_conf();
-    subM_.push_back(numFilters_ / conf.groups());
-    subN_.push_back(conf.output_x() * conf.output_x());
-    subK_.push_back(conf.channels() * conf.filter_size() * conf.filter_size() /
-                    conf.groups());
-    /* Consistent caffe mode for multiple input */
-    caffeMode_ = conf.caffe_mode();
-  }
-  /* initialize the weightList */
-  CHECK(inputLayers_.size() == parameters_.size());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    size_t height, width;
-    height = filterPixels_[i] * filterChannels_[i];
-    width = numFilters_;
-    // create a new weight
-    CHECK_EQ(parameters_[i]->getSize(), width * height);
-    Weight* w = new Weight(height, width, parameters_[i]);
-    weights_.emplace_back(w);
-  }
  return true;
 }
@@ -63,72 +38,6 @@ size_t ExpandConvLayer::getOutputSize() {
  return layerSize;
 }
-void ExpandConvLayer::expandOneFrame(MatrixPtr image, size_t startIdx,
-                                     int inIdx) {
-  resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]);
-  real *imgData = image->getData() + startIdx * image->getWidth();
-  MatrixPtr imageTmp = Matrix::create(
-      imgData, 1, imgSizeH_[inIdx] * imgSizeW_[inIdx] * channels_[inIdx], false,
-      useGpu_);
-  expandInput_->convExpand(*imageTmp, imgSizeH_[inIdx], imgSizeW_[inIdx],
-                           channels_[inIdx], filterSize_[inIdx],
-                           filterSize_[inIdx], stride_[inIdx], stride_[inIdx],
-                           padding_[inIdx], padding_[inIdx],
-                           outputH_[inIdx], outputW_[inIdx]);
-  imageTmp->clear();
-}
-void ExpandConvLayer::expandFwdOnce(MatrixPtr image, int inIdx, int startIdx) {
-  int subM = subM_[inIdx];
-  int subN = subN_[inIdx];
-  int subK = subK_[inIdx];
-  expandOneFrame(image, startIdx, inIdx);
-  real *outData =
-      getOutputValue()->getData() + startIdx * subN * numFilters_;
-  real *wgtData = weights_[inIdx]->getW()->getData();
-  real *expInData = expandInput_->getData();
-  for (int g = 0; g < groups_[inIdx]; ++g) {
-    MatrixPtr A =
-        Matrix::create(wgtData, subK, subM, true, useGpu_);  // mark transpose
-    MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_);
-    MatrixPtr C = Matrix::create(outData, subM, subN, false, useGpu_);
-    C->mul(A, B, 1, 1);
-    A->clear();
-    B->clear();
-    C->clear();
-    wgtData += subK * subM;
-    expInData += subK * subN;
-    outData += subM * subN;
-  }
-}
-void ExpandConvLayer::addSharedBias() {
-  size_t mapW = getOutputValue()->getWidth() / numFilters_;
-  size_t mapH = getOutputValue()->getElementCnt() / mapW;
-  MatrixPtr out =
-      Matrix::create(getOutputValue()->getData(), mapH, mapW, false, useGpu_);
-  Matrix::resizeOrCreate(transOutValue_, mapW, mapH, false, useGpu_);
-  out->transpose(transOutValue_, false);  // false means no memory allocation
-  transOutValue_->reshape(transOutValue_->getElementCnt() / numFilters_,
-                          numFilters_);
-  MatrixPtr bias =
-      Matrix::create(biases_->getW()->getData(), 1,
-                     biases_->getW()->getElementCnt(), false, useGpu_);
-  transOutValue_->addBias(*bias, 1.0f);
-  transOutValue_->reshape(mapW, mapH);
-  transOutValue_->transpose(out, false);  // false means no memory allocation
 void ExpandConvLayer::forward(PassType passType) {
  Layer::forward(passType);
@@ -145,7 +54,7 @@ void ExpandConvLayer::forward(PassType passType) {
    image = prevLayer->getOutputValue();
    for (size_t off = 0; off < image->getHeight(); off++) {
      REGISTER_TIMER_INFO("expandFwdOnce", getName().c_str());
-      expandFwdOnce(image, i, off);
+      expandFwdOnce(image, getOutputValue(), i, off);
    }
  }
  /* add the bias-vector */
@@ -161,29 +70,6 @@ void ExpandConvLayer::forward(PassType passType) {
  forwardActivation();
 }
-void ExpandConvLayer::bpropSharedBias(MatrixPtr biases, MatrixPtr v) {
-  size_t mapW = v->getWidth() / numFilters_;
-  size_t mapH = v->getElementCnt() / mapW;
-  MatrixPtr vTmp = Matrix::create(v->getData(), mapH, mapW, false, useGpu_);
-  Matrix::resizeOrCreate(transOutValue_, mapW, mapH, false, useGpu_);
-  vTmp->transpose(transOutValue_, false);  // false means no memory allocation
-  vTmp->reshape(transOutValue_->getElementCnt() / numFilters_, numFilters_);
-  biases->collectBias(*vTmp, 1.0f);
-}
-void ExpandConvLayer::bpropBiases(MatrixPtr v) {
-  MatrixPtr biases =
-      Matrix::create(biases_->getWGrad()->getData(), 1,
-                     biases_->getWGrad()->getElementCnt(), false, useGpu_);
-  if (sharedBiases_) {
-    bpropSharedBias(biases, v);
-  } else {
-    biases->collectBias(*v, 1.0f);
-  }
-  biases->clear();
-}
 void ExpandConvLayer::backward(const UpdateCallback &callback) {
  backwardActivation();
@@ -197,109 +83,16 @@ void ExpandConvLayer::backward(const UpdateCallback &callback) {
  for (size_t i = 0; i != inputLayers_.size(); ++i) {
    /* First, calculate the input layers error */
-    bpropActs(outGrad, i);
+    if (NULL != getPrev(i)->getOutputGrad()) {
+      bpropActs(outGrad, getPrev(i)->getOutputGrad(), i);
+    }
    if (weights_[i]->getWGrad()) {
      /* Then, calculate the W-gradient for the current layer */
-      bpropWeights(outGrad, i);
+      bpropWeights(getPrev(i)->getOutputValue(), outGrad, i);
      /* Increasing the number of gradient */
      weights_[i]->getParameterPtr()->incUpdate(callback);
    }
  }
 }
-void ExpandConvLayer::bpropWeights(MatrixPtr v, int inpIdx) {
-  MatrixPtr weightGrad = weights_[inpIdx]->getWGrad();
-  MatrixPtr inputV = getPrev(inpIdx)->getOutputValue();
-  int subM = subM_[inpIdx];
-  int subN = subN_[inpIdx];
-  int subK = subK_[inpIdx];
-  size_t batchSize = inputV->getHeight();
-  resetExpandInput(subK * groups_[inpIdx], subN);
-  resetConvOutput(batchSize, inpIdx);
-  real *gradData = v->getData();
-  for (size_t n = 0; n < batchSize; n++) {  // frame by frame
-    // expand
-    expandOneFrame(inputV, n, inpIdx);
-    real *wGradData = weightGrad->getData();
-    real *expandInData = expandInput_->getData();
-    // expand-mul one-group by one
-    for (int g = 0; g < groups_[inpIdx]; g++) {
-      MatrixPtr A = Matrix::create(expandInData, subK, subN, false, useGpu_);
-      MatrixPtr B = Matrix::create(gradData, subM, subN, true, useGpu_);
-      MatrixPtr C = Matrix::create(wGradData, subK, subM, false, useGpu_);
-      C->mul(A, B, 1, 1);
-      A->clear();
-      B->clear();
-      C->clear();
-      gradData += subM * subN;
-      wGradData += subK * subM;
-      expandInData += subK * subN;
-    }
-  }
-}
-void ExpandConvLayer::bpropActs(MatrixPtr v, int inpIdx) {
-  LayerPtr prevLayer = getPrev(inpIdx);
-  if (NULL == prevLayer->getOutputGrad()) {
-    return;
-  }
-  int subM = subM_[inpIdx];
-  int subN = subN_[inpIdx];
-  int subK = subK_[inpIdx];
-  size_t batchSize = v->getHeight();
-  MatrixPtr tgtGrad = prevLayer->getOutputGrad();
-  /* reset the expand-grad memory */
-  resetExpandInput(subK * groups_[inpIdx], subN);
-  resetConvOutput(batchSize, inpIdx);
-  real *localGradData = v->getData();
-  real *tgtGradData = tgtGrad->getData();
-  for (size_t n = 0; n < batchSize; n++) {
-    real *wgtData = weights_[inpIdx]->getW()->getData();
-    real *expandInData = expandInput_->getData();
-    for (int g = 0; g < groups_[inpIdx]; g++) {
-      // create temporary matrix
-      MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_);
-      MatrixPtr B = Matrix::create(localGradData, subM, subN, false, useGpu_);
-      MatrixPtr A = Matrix::create(wgtData, subK, subM, false, useGpu_);
-      C->mul(A, B);  // mul
-      // clear the temporary matrix
-      A->clear();
-      B->clear();
-      C->clear();
-      expandInData += subK * subN;
-      localGradData += subM * subN;
-      wgtData += subK * subM;
-    }
-    // shrink one frame outGrad
-    MatrixPtr oneGradTmp = Matrix::create(
-        expandInput_->getData(), subK * groups_[inpIdx], subN, false, useGpu_);
-    MatrixPtr vTmp = Matrix::create(
-        tgtGradData, 1,
-        imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channels_[inpIdx], false,
-        useGpu_);
-    vTmp->convShrink(*oneGradTmp, imgSizeH_[inpIdx], imgSizeW_[inpIdx],
-                     channels_[inpIdx], filterSize_[inpIdx],
-                     filterSize_[inpIdx], stride_[inpIdx], stride_[inpIdx],
-                     padding_[inpIdx], padding_[inpIdx],
-                     outputH_[inpIdx], outputW_[inpIdx], 1.0f, 1.0f);
-    vTmp->clear();
-    oneGradTmp->clear();
-    // move the data-pointer
-    tgtGradData += imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channels_[inpIdx];
-  }
-}
 }  // namespace paddle
--- a/paddle/gserver/layers/ExpandConvLayer.h
+++ b/paddle/gserver/layers/ExpandConvLayer.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
-#include "ConvBaseLayer.h"
+#include "ConvBaseLayerCpu.h"
 #include "paddle/math/Matrix.h"
 #include <vector>
@@ -28,24 +28,11 @@ namespace paddle {
 *
 * The config file api is img_conv_layer.
 */
-class ExpandConvLayer : public ConvBaseLayer {
-protected:
-  /// For expand convolution.
-  /// subM_ = numFilters_ / groups_.
-  IntV subM_;
-  /// subN_ = outputH_ * outputW_.
-  IntV subN_;
-  /// subK_ = channels_ * filterPixels_ * groups_.
-  IntV subK_;
-  /// Expand one sample at a time. shape:
-  /// (numChannels * filterPixels_, outputSizeH * outputSizeW)
-  MatrixPtr expandInput_;
-  /// The transpose of output, which is an auxiliary matrix.
-  MatrixPtr transOutValue_;
+class ExpandConvLayer : public ConvBaseLayerCpu {
 public:
-  explicit ExpandConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+  explicit ExpandConvLayer(const LayerConfig& config) :
+    ConvBaseLayerCpu(config) {}
  ~ExpandConvLayer() {}
@@ -53,23 +40,8 @@ public:
  size_t getOutputSize();
-  /**
-   * Expand one input sample.
-   */
-  void expandOneFrame(MatrixPtr image, size_t startIdx, int inIdx);
-  /**
-   * Expand one input sample and perform matrix multiplication.
-   */
-  void expandFwdOnce(MatrixPtr image, int inIdx, int startIdx);
  void forward(PassType passType);
-  void bpropSharedBias(MatrixPtr biases, MatrixPtr v);
-  void bpropBiases(MatrixPtr v);
  void backward(const UpdateCallback& callback);
-  void bpropWeights(MatrixPtr v, int inpIdx);
-  void bpropActs(MatrixPtr v, int inpIdx);
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/ExpandConvTransLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvTransLayer.cpp
@@ -29,18 +29,7 @@ REGISTER_LAYER(exconvt, ExpandConvTransLayer);
 bool ExpandConvTransLayer::init(const LayerMap &layerMap,
                           const ParameterMap &parameterMap) {
  /* Initialize the basic convolutional parent class */
-  ConvBaseLayer::init(layerMap, parameterMap);
+  ConvBaseLayerCpu::init(layerMap, parameterMap);
-  /* Initialize the projection */
-  for (auto &inputConfig : config_.inputs()) {
-    const ConvConfig &conf = inputConfig.conv_conf();
-    subM_.push_back(conf.channels() / conf.groups());
-    subN_.push_back(conf.output_x() * conf.output_x());
-    subK_.push_back(numFilters_ * conf.filter_size() * conf.filter_size() /
-                    conf.groups());
-    /* Consistent caffe mode for multiple input */
-    caffeMode_ = conf.caffe_mode();
-  }
  return true;
 }
@@ -73,67 +62,6 @@ size_t ExpandConvTransLayer::getSize() {
  return layerSize;
 }
-void ExpandConvTransLayer::resetExpandInput(size_t height, size_t width) {
-  Matrix::resizeOrCreate(expandInput_, height, width, false, useGpu_);
-}
-/*void ExpandConvTransLayer::resetConvOutput(size_t batchSize, int inIdx) {
-  Matrix::resizeOrCreate(transOutValue_, batchSize * numFilters_, subN_[inIdx],
-                         false, useGpu_);
-}*/
-void ExpandConvTransLayer::expandOneFrame(MatrixPtr image, size_t startIdx,
-                                     int inIdx) {
-  resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]);
-  real *imgData = image->getData() + startIdx * image->getWidth();
-  MatrixPtr imageTmp = Matrix::create(
-      imgData, 1, imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel_, false,
-      useGpu_);
-  expandInput_->convExpand(*imageTmp, imgSizeH_[inIdx], imgSizeW_[inIdx],
-                           channel_, filterSize_[inIdx],
-                           filterSize_[inIdx], stride_[inIdx], stride_[inIdx],
-                           padding_[inIdx], padding_[inIdx],
-                           outputH_[inIdx], outputW_[inIdx]);
-  imageTmp->clear();
-}
-void ExpandConvTransLayer::expandBackOnce(MatrixPtr imageGrad, int inIdx,
-                                        int startIdx) {
-  int subM = subM_[inIdx];
-  int subN = subN_[inIdx];
-  int subK = subK_[inIdx];
-  LayerPtr prevLayer = getPrev(inIdx);
-  if (NULL == prevLayer->getOutputGrad()) {
-    return;
-  }
-  expandOneFrame(imageGrad, startIdx, inIdx);
-  real *outGradData =
-      prevLayer -> getOutputGrad()->getData()
-                  + startIdx * subN * numFilters_[inIdx];
-  real *wgtData = weights_[inIdx]->getW()->getData();
-  real *expInData = expandInput_->getData();
-  for (int g = 0; g < groups_[inIdx]; ++g) {
-    MatrixPtr A =
-        Matrix::create(wgtData, subK, subM, true, useGpu_);  // mark transpose
-    MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_);
-    MatrixPtr C = Matrix::create(outGradData, subM, subN, false, useGpu_);
-    C->mul(A, B, 1, 1);
-    A->clear();
-    B->clear();
-    C->clear();
-    wgtData += subK * subM;
-    expInData += subK * subN;
-    outGradData += subM * subN;
-  }
-}
 void ExpandConvTransLayer::forward(PassType passType) {
  Layer::forward(passType);
@@ -148,7 +76,7 @@ void ExpandConvTransLayer::forward(PassType passType) {
    LayerPtr prevLayer = getPrev(i);
    output = prevLayer->getOutputValue();
    REGISTER_TIMER_INFO("shrinkFwd", getName().c_str());
-    shrinkFwd(output, i);
+    bpropActs(output, getOutputValue(), i);
  }
  /* add the bias-vector */
@@ -164,84 +92,6 @@ void ExpandConvTransLayer::forward(PassType passType) {
  forwardActivation();
 }
-void ExpandConvTransLayer::shrinkFwd(MatrixPtr output, int inpIdx) {
-  int subM = subM_[inpIdx];
-  int subN = subN_[inpIdx];
-  int subK = subK_[inpIdx];
-  size_t batchSize = output->getHeight();
-  MatrixPtr image = getOutputValue();
-  /* reset the expand-grad memory */
-  resetExpandInput(subK * groups_[inpIdx], subN);
-  real *localData = output->getData();
-  real *imageData = image->getData();
-  for (size_t n = 0; n < batchSize; n++) {
-    real *wgtData = weights_[inpIdx]->getW()->getData();
-    real *expandInData = expandInput_->getData();
-    for (int g = 0; g < groups_[inpIdx]; g++) {
-      // create temporary matrix
-      MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_);
-      MatrixPtr B = Matrix::create(localData, subM, subN, false, useGpu_);
-      MatrixPtr A = Matrix::create(wgtData, subK, subM, false, useGpu_);
-      C->mul(A, B);  // mul
-      // clear the temporary matrix
-      A->clear();
-      B->clear();
-      C->clear();
-      expandInData += subK * subN;
-      localData += subM * subN;
-      wgtData += subK * subM;
-    }
-    // shrink one frame outGrad
-    MatrixPtr oneTmp = Matrix::create(
-        expandInput_->getData(), subK * groups_[inpIdx], subN, false, useGpu_);
-    MatrixPtr vTmp = Matrix::create(
-        imageData, 1,
-        imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel_, false,
-        useGpu_);
-    vTmp->convShrink(*oneTmp, imgSizeH_[inpIdx], imgSizeW_[inpIdx],
-                     channel_, filterSize_[inpIdx],
-                     filterSize_[inpIdx], stride_[inpIdx], stride_[inpIdx],
-                     padding_[inpIdx], padding_[inpIdx],
-                     outputH_[inpIdx], outputW_[inpIdx], 1.0f, 1.0f);
-    vTmp->clear();
-    oneTmp->clear();
-    // move the data-pointer
-    imageData += imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel_;
-  }
-}
-void ExpandConvTransLayer::bpropSharedBias(MatrixPtr biases, MatrixPtr v) {
-  size_t mapW = getSize() / channel_;
-  size_t mapH = v->getElementCnt() / mapW;
-  MatrixPtr vTmp = Matrix::create(v->getData(), mapH, mapW, false, useGpu_);
-  Matrix::resizeOrCreate(transOutValue_, mapW, mapH, false, useGpu_);
-  vTmp->transpose(transOutValue_, false);  // false means no memory allocation
-  vTmp->reshape(transOutValue_->getElementCnt() / channel_, channel_);
-  biases->collectBias(*vTmp, 1.0f);
-}
-void ExpandConvTransLayer::bpropBiases(MatrixPtr v) {
-  MatrixPtr biases =
-      Matrix::create(biases_->getWGrad()->getData(), 1,
-                     biases_->getWGrad()->getElementCnt(), false, useGpu_);
-  if (sharedBiases_) {
-    bpropSharedBias(biases, v);
-  } else {
-    biases->collectBias(*v, 1.0f);
-  }
-  biases->clear();
-}
 void ExpandConvTransLayer::backward(const UpdateCallback &callback) {
  backwardActivation();
@@ -255,51 +105,18 @@ void ExpandConvTransLayer::backward(const UpdateCallback &callback) {
  for (size_t i = 0; i != inputLayers_.size(); ++i) {
    /* First, calculate the input layers error */
    for (size_t off = 0; off < imageGrad->getHeight(); off++) {
-        expandBackOnce(imageGrad, i, off);
+      if (NULL != getPrev(i)->getOutputGrad()) {
+        expandFwdOnce(imageGrad, getPrev(i)->getOutputGrad(), i, off);
+      }
    }
    if (weights_[i]->getWGrad()) {
      /* Then, calculate the W-gradient for the current layer */
-      bpropWeights(imageGrad, i);
+      bpropWeights(imageGrad, getPrev(i)->getOutputValue(), i);
      /* Increasing the number of gradient */
      weights_[i]->getParameterPtr()->incUpdate(callback);
    }
  }
 }
-void ExpandConvTransLayer::bpropWeights(MatrixPtr v, int inpIdx) {
-  MatrixPtr weightGrad = weights_[inpIdx]->getWGrad();
-  MatrixPtr outputV = getPrev(inpIdx)->getOutputValue();
-  int subM = subM_[inpIdx];
-  int subN = subN_[inpIdx];
-  int subK = subK_[inpIdx];
-  size_t batchSize = outputV->getHeight();
-  resetExpandInput(subK * groups_[inpIdx], subN);
-  real *outputData = outputV -> getData();
-  for (size_t n = 0; n < batchSize; n++) {  // frame by frame
-    // expand
-    expandOneFrame(v, n, inpIdx);
-    real *wGradData = weightGrad->getData();
-    real *expandInData = expandInput_->getData();
-    // expand-mul one-group by one
-    for (int g = 0; g < groups_[inpIdx]; g++) {
-      MatrixPtr A = Matrix::create(expandInData, subK, subN, false, useGpu_);
-      MatrixPtr B = Matrix::create(outputData, subM, subN, true, useGpu_);
-      MatrixPtr C = Matrix::create(wGradData, subK, subM, false, useGpu_);
-      C->mul(A, B, 1, 1);
-      A->clear();
-      B->clear();
-      C->clear();
-      outputData += subM * subN;
-      wGradData += subK * subM;
-      expandInData += subK * subN;
-    }
-  }
-}
 }  // namespace paddle
--- a/paddle/gserver/layers/ExpandConvTransLayer.h
+++ b/paddle/gserver/layers/ExpandConvTransLayer.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
-#include "ConvBaseLayer.h"
+#include "ConvBaseLayerCpu.h"
 #include "paddle/math/Matrix.h"
 #include <vector>
@@ -24,32 +24,14 @@ namespace paddle {
 /**
 * @brief A subclass of convolution layer.
 * This layer expands input and use matrix multiplication to
- * calculate convolution operation.
+ * calculate convolution transpose (deconv) operation.
 *
 * The config file api is img_convTrans_layer.
 */
-class ExpandConvTransLayer : public ConvBaseLayer {
+class ExpandConvTransLayer : public ConvBaseLayerCpu {
-protected:
-  /// For expand convolution.
-  /// subM_ = numFilters_ / groups_.
-  IntV subM_;
-  /// subN_ = outputH_ * outputW_.
-  IntV subN_;
-  /// subK_ = channels_ * filterPixels_ * groups_.
-  IntV subK_;
-  /// The spatial dimensions of height of input feature map.
-  IntV imgSizeH_;
-  /// The spatial dimensions of width of input feature map.
-  IntV imgSizeW_;
-  /// The spatial dimensions of height of output feature map.
-  IntV outputH_;
-  /// The spatial dimensions of width of output feature map.
-  IntV outputW_;
 public:
  explicit ExpandConvTransLayer(const LayerConfig& config) :
-      ConvBaseLayer(config) {}
+    ConvBaseLayerCpu(config) {}
  ~ExpandConvTransLayer() {}
@@ -57,38 +39,8 @@ public:
  size_t getSize();
-  /**
-   * Create or resize expandInput_.
-   */
-  void resetExpandInput(size_t height, size_t width);
-  /**
-   * Create or resize transOutValue_.
-   */
-  void resetConvOutput(size_t batchSize, int inIdx);
-  /**
-   * Expand one input sample.
-   */
-  void expandOneFrame(MatrixPtr image, size_t startIdx, int inIdx);
-  /**
-   * Expand one output image and perform matrix multiplication.
-   */
-  void expandBackOnce(MatrixPtr image, int inIdx, int startIdx);
-  /**
-   * Perform matrix multiplication on one output and then shrink.
-   */
-  void shrinkFwd(MatrixPtr output, int inpIdx);
  void forward(PassType passType);
-  void bpropSharedBias(MatrixPtr biases, MatrixPtr v);
-  void bpropBiases(MatrixPtr v);
  void backward(const UpdateCallback& callback);
-  void bpropWeights(MatrixPtr v, int inpIdx);
-  void bpropActs(MatrixPtr v, int inpIdx);
 };
 }  // namespace paddle
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -302,6 +302,8 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
                              config.layerConfig.num_filters());
  testLayerGrad(config, "conv", 100, trans, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "conv", 2, trans, useGpu, true, 0.02);
 }
 TEST(Layer, convLayer) {