Merge pull request #3163 from NHZlX/fix_conv_1x1

Skip im2col when it is not necessary for 1x1 convolution

Merge pull request #3163 from NHZlX/fix_conv_1x1
Skip im2col when it is not necessary for 1x1 convolution
94cee3d6 · Zhaolong Xing · GitHub · 28db1491 · fa10677a · 94cee3d6
显示空白变更内容
内联并排

Showing 2 changed files with 90 additions and 48 deletions

paddle/function/ConvOp.h paddle/function/ConvOp.h +7 -0

paddle/function/GemmConvOp.cpp paddle/function/GemmConvOp.cpp +83 -48

未找到文件。
--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
@@ -109,6 +109,13 @@ protected:
    return filter[filter.ndims() - 1];
  }

+  // im2col is unnecessary only for a 1x1 filter with stride 1 and padding 0
+  inline bool isNeedIm2col(const TensorShape& filter) const {
+    return !(getFilterHeight(filter) == 1 && getFilterWidth(filter) == 1 &&
+             strideH() == 1 && strideW() == 1 && paddingH() == 0 &&
+             paddingW() == 0);
+  }
+
  std::vector<size_t> strides_;
  std::vector<size_t> paddings_;


--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -66,16 +66,23 @@ public:
    real* inputData = inputs[0].data<real>();
    real* filterData = inputs[1].data<real>();
    real* outputData = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape = TensorShape({inputChannels / groups_,
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
                              filterHeight,
                              filterWidth,
                              outputHeight,
                              outputWidth});
-
      resizeBuffer<Device>(colShape.getElements());
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }

    Im2ColFunctor<kCFO, Device, real> im2col;
    GemmFunctor<Device, real> gemm;
@@ -86,6 +93,7 @@ public:

    for (size_t i = 0; i < batchSize; i++) {
      for (size_t g = 0; g < groups_; g++) {
+        if (needIm2col) {
          im2col(inputData + g * inputOffset,
                 imShape,
                 colData,
@@ -94,7 +102,9 @@ public:
                 strideW(),
                 paddingH(),
                 paddingW());
-
+        } else {
+          colData = inputData + g * inputOffset;
+        }
        int M = outputChannels / groups_;
        int N = outputHeight * outputWidth;
        int K = inputChannels / groups_ * filterHeight * filterWidth;
@@ -159,19 +169,27 @@ public:
    real* outputGrad = inputs[0].data<real>();
    real* filterData = inputs[1].data<real>();
    real* inputGrad = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape = TensorShape({inputChannels / groups_,
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
                              filterHeight,
                              filterWidth,
                              outputHeight,
                              outputWidth});
-
      resizeBuffer<Device>(colShape.getElements());
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }

    Col2ImFunctor<kCFO, Device, real> col2im;
    GemmFunctor<Device, real> gemm;
+
    size_t inputOffset = imShape.getElements();
    size_t outputOffset =
        (outputChannels / groups_) * outputHeight * outputWidth;
@@ -182,6 +200,11 @@ public:
        int K = outputChannels / groups_;
        int N = outputHeight * outputWidth;
        int M = inputChannels / groups_ * filterHeight * filterWidth;
+        real scale = 0.0f;
+        if (!needIm2col) {
+          colData = inputGrad + g * inputOffset;
+          scale = 1.0f;
+        }
        gemm(CblasTrans,
             CblasNoTrans,
             M,
@@ -192,9 +215,10 @@ public:
             M,
             outputGrad + g * outputOffset,
             N,
-             0.0f,
+             scale,
             colData,
             N);
+        if (needIm2col) {
          col2im(inputGrad + g * inputOffset,
                 imShape,
                 colData,
@@ -204,6 +228,7 @@ public:
                 paddingH(),
                 paddingW());
        }
+      }
      inputGrad += inputChannels * inputHeight * inputWidth;
      outputGrad += outputChannels * outputHeight * outputWidth;
    }
@@ -255,16 +280,23 @@ public:
    real* outputGrad = inputs[0].data<real>();
    real* inputData = inputs[1].data<real>();
    real* filterGrad = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape = TensorShape({inputChannels / groups_,
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
                              filterHeight,
                              filterWidth,
                              outputHeight,
                              outputWidth});
-
      resizeBuffer<Device>(colShape.getElements());
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }

    Im2ColFunctor<kCFO, Device, real> im2col;
    GemmFunctor<Device, real> gemm;
@@ -274,6 +306,7 @@ public:
    size_t filterOffset = filter.getElements() / groups_;
    for (size_t i = 0; i < batchSize; i++) {
      for (size_t g = 0; g < groups_; g++) {
+        if (needIm2col) {
          im2col(inputData + g * inputOffset,
                 imShape,
                 colData,
@@ -282,7 +315,9 @@ public:
                 strideW(),
                 paddingH(),
                 paddingW());
-
+        } else {
+          colData = inputData + g * inputOffset;
+        }
        int M = outputChannels / groups_;
        int K = outputHeight * outputWidth;
        int N = inputChannels / groups_ * filterHeight * filterWidth;