Add Im2ColMobileFunctor.

d775895e · hedaoyuan · dbf1d75f · d775895e · d775895e
隐藏空白更改
内联并排

Showing 76 additions and 28 deletions

paddle/function/GemmConvOp.cpp paddle/function/GemmConvOp.cpp +28 -28

paddle/function/Im2Col.h paddle/function/Im2Col.h +48 -0

未找到文件。
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -206,8 +206,7 @@ public:
      colData = reinterpret_cast<real*>(memory_->getBuf());
    }

-    Im2ColFunctor<kCFO, Device, real> im2col;
-    GemmFunctor<Device, real> gemm;
+    Im2ColMobileFunctor<real> im2col;
    size_t inputOffset = imShape.getElements();
    size_t outputOffset =
        (outputChannels / groups_) * outputHeight * outputWidth;
@@ -241,19 +240,20 @@ public:

              // gemm
              int M = outputChannels / groups_;
-              gemm(CblasNoTrans,
-                   CblasNoTrans,
-                   M,
-                   N,
-                   K,
-                   1.0f,
-                   filterData + g * filterOffset + colHeightStart,
-                   kStride,
-                   colData,
-                   N,
-                   beta_,
-                   outputData + g * outputOffset + colWidthStart,
-                   nStride);
+              BlasGemm<Device, real>::compute(
+                  false,
+                  false,
+                  M,
+                  N,
+                  K,
+                  1.0f,
+                  filterData + g * filterOffset + colHeightStart,
+                  kStride,
+                  colData,
+                  N,
+                  beta_,
+                  outputData + g * outputOffset + colWidthStart,
+                  nStride);
            }
            beta_ = 1.0;
          }
@@ -261,19 +261,19 @@ public:
          int M = outputChannels / groups_;
          int N = outputHeight * outputWidth;
          int K = inputChannels / groups_ * filterHeight * filterWidth;
-          gemm(CblasNoTrans,
-               CblasNoTrans,
-               M,
-               N,
-               K,
-               1.0f,
-               filterData + g * filterOffset,
-               K,
-               inputData + g * inputOffset,
-               N,
-               beta,
-               outputData + g * outputOffset,
-               N);
+          BlasGemm<Device, real>::compute(false,
+                                          false,
+                                          M,
+                                          N,
+                                          K,
+                                          1.0f,
+                                          filterData + g * filterOffset,
+                                          K,
+                                          inputData + g * inputOffset,
+                                          N,
+                                          beta,
+                                          outputData + g * outputOffset,
+                                          N);
        }
      }
      inputData += inputChannels * inputHeight * inputWidth;

--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -98,4 +98,52 @@ public:
                  int dilationWidth = 1);
 };

+/*
+ * Im2ColMobileFunctor fills a rectangular tile of the im2col matrix.
+ *
+ * The tile covers matrix rows [colHeightStart, colHeightStart + colHeightSize)
+ * and matrix columns [colWidthStart, colWidthStart + colWidthSize). It is
+ * written densely into colData, row after row, with a row pitch of
+ * colWidthSize elements.
+ *
+ * A matrix row index is decomposed into (channel, filter row, filter column)
+ * using colShape[1] (filter height) and colShape[2] (filter width). A matrix
+ * column index is decomposed into (output row, output column) using
+ * colShape[4] (output width). The image is addressed as
+ * imData[(channel * imShape[1] + row) * imShape[2] + col].
+ *
+ * Positions that fall outside the image once the padding has been removed
+ * are written as T(0).
+ */
+template <class T>
+class Im2ColMobileFunctor {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int colHeightStart,
+                  int colHeightSize,
+                  int colWidthStart,
+                  int colWidthSize) {
+    const int inputHeight = imShape[1];
+    const int inputWidth = imShape[2];
+    const int filterHeight = colShape[1];
+    const int filterWidth = colShape[2];
+    const int outputWidth = colShape[4];
+
+    // The tile is stored contiguously in row-major order, so a single
+    // cursor that advances after every store visits
+    // colData[rowInTile * colWidthSize + colInTile] in sequence.
+    T* cursor = colData;
+
+    for (int rowInTile = 0; rowInTile < colHeightSize; ++rowInTile) {
+      // Split the absolute matrix row into kernel tap and input channel.
+      const int matrixRow = colHeightStart + rowInTile;
+      const int kernelCol = matrixRow % filterWidth;
+      const int kernelRow = (matrixRow / filterWidth) % filterHeight;
+      const int channel = matrixRow / filterWidth / filterHeight;
+
+      for (int colInTile = 0; colInTile < colWidthSize; ++colInTile) {
+        // Split the absolute matrix column into an output pixel position.
+        const int matrixCol = colWidthStart + colInTile;
+        const int outRow = matrixCol / outputWidth;
+        const int outCol = matrixCol % outputWidth;
+
+        // Image coordinates of the sampled pixel, padding already removed.
+        const int srcRow = outRow * strideHeight + kernelRow - paddingHeight;
+        const int srcCol = outCol * strideWidth + kernelCol - paddingWidth;
+
+        const bool insideImage = srcRow >= 0 && srcRow < inputHeight &&
+                                 srcCol >= 0 && srcCol < inputWidth;
+        if (insideImage) {
+          const int planeRow = srcRow + channel * inputHeight;
+          *cursor = imData[planeRow * inputWidth + srcCol];
+        } else {
+          // The sample lies in the padding border.
+          *cursor = T(0);
+        }
+        ++cursor;
+      }
+    }
+  }
+};
+
 }  // namespace paddle