Skip im2col when it is not necessary for 1x1 convolution

5229df52 · xzl · f70e8077 · 5229df52 · 5229df52
隐藏空白更改
内联并排

Showing 2 changed files with 94 additions and 51 deletions

paddle/function/ConvOp.h paddle/function/ConvOp.h +7 -0

paddle/function/GemmConvOp.cpp paddle/function/GemmConvOp.cpp +87 -51

未找到文件。
--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
@@ -109,6 +109,13 @@ protected:
    return filter[filter.ndims() - 1];
  }
+  // Returns true when im2col can be skipped: 1x1 filter, stride 1, zero padding
+  inline bool isSkipIm2col(const TensorShape& filter) const {
+    return (getFilterHeight(filter) == 1 && getFilterWidth(filter) == 1 &&
+            strideH() == 1 && strideW() == 1 && paddingH() == 0 &&
+            paddingW() == 0);
+  }
  std::vector<size_t> strides_;
  std::vector<size_t> paddings_;

--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -66,16 +66,23 @@ public:
    real* inputData = inputs[0].data<real>();
    real* filterData = inputs[1].data<real>();
    real* outputData = outputs[0].data<real>();
+    bool skipIm2col = isSkipIm2col(filter);
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape = TensorShape({inputChannels / groups_,
-                                        filterHeight,
-                                        filterWidth,
-                                        outputHeight,
-                                        outputWidth});
-    resizeBuffer<Device>(colShape.getElements());
+    TensorShape colShape;
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+    real *colBuffer, *colData = NULL;
+    if (!skipIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
    Im2ColFunctor<kCFO, Device, real> im2col;
    GemmFunctor<Device, real> gemm;
@@ -86,15 +93,18 @@ public:
    for (size_t i = 0; i < batchSize; i++) {
      for (size_t g = 0; g < groups_; g++) {
-        im2col(inputData + g * inputOffset,
+        colBuffer = inputData + g * inputOffset;
-               imShape,
+        if (!skipIm2col) {
-               colData,
+          im2col(inputData + g * inputOffset,
-               colShape,
+                 imShape,
-               strideH(),
+                 colData,
-               strideW(),
+                 colShape,
-               paddingH(),
+                 strideH(),
-               paddingW());
+                 strideW(),
+                 paddingH(),
+                 paddingW());
+          colBuffer = colData;
+        }
        int M = outputChannels / groups_;
        int N = outputHeight * outputWidth;
        int K = inputChannels / groups_ * filterHeight * filterWidth;
@@ -106,7 +116,7 @@ public:
             1.0f,
             filterData + g * filterOffset,
             K,
-             colData,
+             colBuffer,
             N,
             beta,
             outputData + g * outputOffset,
@@ -159,19 +169,27 @@ public:
    real* outputGrad = inputs[0].data<real>();
    real* filterData = inputs[1].data<real>();
    real* inputGrad = outputs[0].data<real>();
+    bool skipIm2col = isSkipIm2col(filter);
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape = TensorShape({inputChannels / groups_,
-                                        filterHeight,
-                                        filterWidth,
-                                        outputHeight,
-                                        outputWidth});
-    resizeBuffer<Device>(colShape.getElements());
+    TensorShape colShape;
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+    real *colBuffer, *colData = NULL;
+    if (!skipIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
    Col2ImFunctor<kCFO, Device, real> col2im;
    GemmFunctor<Device, real> gemm;
    size_t inputOffset = imShape.getElements();
    size_t outputOffset =
        (outputChannels / groups_) * outputHeight * outputWidth;
@@ -182,6 +200,12 @@ public:
        int K = outputChannels / groups_;
        int N = outputHeight * outputWidth;
        int M = inputChannels / groups_ * filterHeight * filterWidth;
+        colBuffer = colData;
+        real scale = 0.0f;
+        if (skipIm2col) {
+          colBuffer = inputGrad + g * inputOffset;
+          scale = 1.0f;
+        }
        gemm(CblasTrans,
             CblasNoTrans,
             M,
@@ -192,17 +216,19 @@ public:
             M,
             outputGrad + g * outputOffset,
             N,
-             0.0f,
+             scale,
-             colData,
+             colBuffer,
             N);
-        col2im(inputGrad + g * inputOffset,
+        if (!skipIm2col) {
-               imShape,
+          col2im(inputGrad + g * inputOffset,
-               colData,
+                 imShape,
-               colShape,
+                 colBuffer,
-               strideH(),
+                 colShape,
-               strideW(),
+                 strideH(),
-               paddingH(),
+                 strideW(),
-               paddingW());
+                 paddingH(),
+                 paddingW());
+        }
      }
      inputGrad += inputChannels * inputHeight * inputWidth;
      outputGrad += outputChannels * outputHeight * outputWidth;
@@ -255,16 +281,23 @@ public:
    real* outputGrad = inputs[0].data<real>();
    real* inputData = inputs[1].data<real>();
    real* filterGrad = outputs[0].data<real>();
+    bool skipIm2col = isSkipIm2col(filter);
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape = TensorShape({inputChannels / groups_,
-                                        filterHeight,
-                                        filterWidth,
-                                        outputHeight,
-                                        outputWidth});
-    resizeBuffer<Device>(colShape.getElements());
+    TensorShape colShape;
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+    real *colBuffer, *colData = NULL;
+    if (!skipIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
    Im2ColFunctor<kCFO, Device, real> im2col;
    GemmFunctor<Device, real> gemm;
@@ -274,15 +307,18 @@ public:
    size_t filterOffset = filter.getElements() / groups_;
    for (size_t i = 0; i < batchSize; i++) {
      for (size_t g = 0; g < groups_; g++) {
-        im2col(inputData + g * inputOffset,
+        colBuffer = inputData + g * inputOffset;
-               imShape,
+        if (!skipIm2col) {
-               colData,
+          im2col(inputData + g * inputOffset,
-               colShape,
+                 imShape,
-               strideH(),
+                 colData,
-               strideW(),
+                 colShape,
-               paddingH(),
+                 strideH(),
-               paddingW());
+                 strideW(),
+                 paddingH(),
+                 paddingW());
+          colBuffer = colData;
+        }
        int M = outputChannels / groups_;
        int K = outputHeight * outputWidth;
        int N = inputChannels / groups_ * filterHeight * filterWidth;
@@ -294,7 +330,7 @@ public:
             1.0f,
             outputGrad + g * outputOffset,
             K,
-             colData,
+             colBuffer,
             K,
             i == 0 ? beta : 1.0f,
             filterGrad + g * filterOffset,