Merge pull request #3163 from NHZlX/fix_conv_1x1

ignore im2col if not necessary in conv 1 * 1

Merge pull request #3163 from NHZlX/fix_conv_1x1
ignore im2col if not necessary in conv 1 * 1
94cee3d6 · Zhaolong Xing · GitHub · 28db1491 · fa10677a · 94cee3d6
隐藏空白更改
内联并排

Showing 2 changed files with 90 additions and 48 deletions

paddle/function/ConvOp.h paddle/function/ConvOp.h +7 -0

paddle/function/GemmConvOp.cpp paddle/function/GemmConvOp.cpp +83 -48

未找到文件。
--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
@@ -109,6 +109,13 @@ protected:
    return filter[filter.ndims() - 1];
  }
+  // Returns false only for a 1x1 filter with stride 1 and zero padding; true (im2col required) otherwise.
+  inline bool isNeedIm2col(const TensorShape& filter) const {
+    return !(getFilterHeight(filter) == 1 && getFilterWidth(filter) == 1 &&
+             strideH() == 1 && strideW() == 1 && paddingH() == 0 &&
+             paddingW() == 0);
+  }
  std::vector<size_t> strides_;
  std::vector<size_t> paddings_;

--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -66,16 +66,23 @@ public:
    real* inputData = inputs[0].data<real>();
    real* filterData = inputs[1].data<real>();
    real* outputData = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape = TensorShape({inputChannels / groups_,
-                                        filterHeight,
-                                        filterWidth,
-                                        outputHeight,
-                                        outputWidth});
-    resizeBuffer<Device>(colShape.getElements());
+    TensorShape colShape;
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+    real* colData = NULL;
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
    Im2ColFunctor<kCFO, Device, real> im2col;
    GemmFunctor<Device, real> gemm;
@@ -86,15 +93,18 @@ public:
    for (size_t i = 0; i < batchSize; i++) {
      for (size_t g = 0; g < groups_; g++) {
-        im2col(inputData + g * inputOffset,
+        if (needIm2col) {
-               imShape,
+          im2col(inputData + g * inputOffset,
-               colData,
+                 imShape,
-               colShape,
+                 colData,
-               strideH(),
+                 colShape,
-               strideW(),
+                 strideH(),
-               paddingH(),
+                 strideW(),
-               paddingW());
+                 paddingH(),
+                 paddingW());
+        } else {
+          colData = inputData + g * inputOffset;
+        }
        int M = outputChannels / groups_;
        int N = outputHeight * outputWidth;
        int K = inputChannels / groups_ * filterHeight * filterWidth;
@@ -159,19 +169,27 @@ public:
    real* outputGrad = inputs[0].data<real>();
    real* filterData = inputs[1].data<real>();
    real* inputGrad = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape = TensorShape({inputChannels / groups_,
-                                        filterHeight,
-                                        filterWidth,
-                                        outputHeight,
-                                        outputWidth});
-    resizeBuffer<Device>(colShape.getElements());
+    TensorShape colShape;
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+    real* colData = NULL;
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
    Col2ImFunctor<kCFO, Device, real> col2im;
    GemmFunctor<Device, real> gemm;
    size_t inputOffset = imShape.getElements();
    size_t outputOffset =
        (outputChannels / groups_) * outputHeight * outputWidth;
@@ -182,6 +200,11 @@ public:
        int K = outputChannels / groups_;
        int N = outputHeight * outputWidth;
        int M = inputChannels / groups_ * filterHeight * filterWidth;
+        real scale = 0.0f;
+        if (!needIm2col) {
+          colData = inputGrad + g * inputOffset;
+          scale = 1.0f;
+        }
        gemm(CblasTrans,
             CblasNoTrans,
             M,
@@ -192,17 +215,19 @@ public:
             M,
             outputGrad + g * outputOffset,
             N,
-             0.0f,
+             scale,
             colData,
             N);
-        col2im(inputGrad + g * inputOffset,
+        if (needIm2col) {
-               imShape,
+          col2im(inputGrad + g * inputOffset,
-               colData,
+                 imShape,
-               colShape,
+                 colData,
-               strideH(),
+                 colShape,
-               strideW(),
+                 strideH(),
-               paddingH(),
+                 strideW(),
-               paddingW());
+                 paddingH(),
+                 paddingW());
+        }
      }
      inputGrad += inputChannels * inputHeight * inputWidth;
      outputGrad += outputChannels * outputHeight * outputWidth;
@@ -255,16 +280,23 @@ public:
    real* outputGrad = inputs[0].data<real>();
    real* inputData = inputs[1].data<real>();
    real* filterGrad = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape = TensorShape({inputChannels / groups_,
-                                        filterHeight,
-                                        filterWidth,
-                                        outputHeight,
-                                        outputWidth});
-    resizeBuffer<Device>(colShape.getElements());
+    TensorShape colShape;
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+    real* colData = NULL;
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
    Im2ColFunctor<kCFO, Device, real> im2col;
    GemmFunctor<Device, real> gemm;
@@ -274,15 +306,18 @@ public:
    size_t filterOffset = filter.getElements() / groups_;
    for (size_t i = 0; i < batchSize; i++) {
      for (size_t g = 0; g < groups_; g++) {
-        im2col(inputData + g * inputOffset,
+        if (needIm2col) {
-               imShape,
+          im2col(inputData + g * inputOffset,
-               colData,
+                 imShape,
-               colShape,
+                 colData,
-               strideH(),
+                 colShape,
-               strideW(),
+                 strideH(),
-               paddingH(),
+                 strideW(),
-               paddingW());
+                 paddingH(),
+                 paddingW());
+        } else {
+          colData = inputData + g * inputOffset;
+        }
        int M = outputChannels / groups_;
        int K = outputHeight * outputWidth;
        int N = inputChannels / groups_ * filterHeight * filterWidth;