From 02e04b44411a851a86217815e7d740c634d8324f Mon Sep 17 00:00:00 2001
From: xzl <>
Date: Tue, 18 Jul 2017 22:04:53 +0800
Subject: [PATCH] fuse the conv and depthwise conv together

 paddle/function/ConvOpTest.cpp | 281 ++++++++++++---------------------
 1 file changed, 104 insertions(+), 177 deletions(-)

diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp
index 61f0c18bed4..27609fbbd44 100644
--- a/paddle/function/ConvOpTest.cpp
+++ b/paddle/function/ConvOpTest.cpp
@@ -25,11 +25,17 @@ enum TestType {
   kBackwardFilterTest = 2,
+enum LayerType {
+  convolutionType = 0,
+  depthwiseConvolutionType = 1,
 template <DeviceType DType1, DeviceType DType2>
 class ConvolutionTest {
   ConvolutionTest(const std::string& conv1,
                   const std::string& conv2,
+                  LayerType layerType,
                   TestType type,
                   std::string algo = "auto") {
     for (size_t batchSize : {1, 32}) {
@@ -37,7 +43,17 @@ public:
         for (size_t filterSize : {1, 3, 5}) {
           for (size_t inputChannels : {3, 64}) {
             for (size_t outputChannels : {3, 64, 128}) {
-              if (inputChannels < outputChannels) break;
+              if (inputChannels > outputChannels) continue;  // skip pair
+              if (layerType == depthwiseConvolutionType &&
+                  outputChannels % inputChannels != 0)
+                continue;  // skip only this pair; later values may divide
+              size_t groups = 1;
+              if (layerType == depthwiseConvolutionType) {
+                groups = inputChannels;
+              }
               for (size_t stride : {1, 2}) {
                 for (size_t padding : {0, 1}) {
                   if (padding >= filterSize) break;
@@ -62,13 +78,24 @@ public:
                           .set("paddings", paddings)
                           .set("strides", strides)
-                          .set("groups", (size_t)1)
+                          .set("groups", groups)
                           .set("algo", algo));
                   TensorShape input{
                       batchSize, inputChannels, inputSize, inputSize};
-                  TensorShape filter{
-                      outputChannels, inputChannels, filterSize, filterSize};
+                  TensorShape filter;
+                  if (layerType == depthwiseConvolutionType)
+                    filter = TensorShape({groups,
+                                          outputChannels / groups,
+                                          (size_t)1,
+                                          filterSize,
+                                          filterSize});
+                  else
+                    filter = TensorShape({outputChannels,
+                                          inputChannels,
+                                          filterSize,
+                                          filterSize});
                   TensorShape output{
                       batchSize, outputChannels, outputSize, outputSize};
@@ -105,6 +132,7 @@ class ConvolutionTest2 {
   ConvolutionTest2(const std::string& conv1,
                    const std::string& conv2,
+                   LayerType layerType,
                    TestType type,
                    std::string algo = "auto") {
     for (size_t batchSize : {16}) {
@@ -113,7 +141,16 @@ public:
           for (size_t filterHeight : {1, 5}) {
             for (size_t filterWidth : {3, 7}) {
               for (size_t inputChannels : {7}) {
-                for (size_t outputChannels : {32}) {
+                for (size_t outputChannels : {7, 32}) {
+                  if (layerType == depthwiseConvolutionType &&
+                      outputChannels % inputChannels != 0)
+                    continue;  // skip this pair, keep sweeping the list
+                  size_t groups = 1;
+                  if (layerType == depthwiseConvolutionType) {
+                    groups = inputChannels;
+                  }
                   size_t stride = 1;
                   size_t padding = 0;
                   size_t outputHeight =
@@ -141,13 +178,24 @@ public:
                           .set("paddings", paddings)
                           .set("strides", strides)
-                          .set("groups", (size_t)1)
+                          .set("groups", groups)
                           .set("algo", algo));
                   TensorShape input{
                       batchSize, inputChannels, inputHeight, inputWidth};
-                  TensorShape filter{
-                      outputChannels, inputChannels, filterHeight, filterWidth};
+                  TensorShape filter;
+                  if (layerType == depthwiseConvolutionType)
+                    filter = TensorShape({groups,
+                                          outputChannels / groups,
+                                          (size_t)1,
+                                          filterHeight,
+                                          filterWidth});
+                  else
+                    filter = TensorShape({outputChannels,
+                                          inputChannels,
+                                          filterHeight,
+                                          filterWidth});
                   TensorShape output{
                       batchSize, outputChannels, outputHeight, outputWidth};
@@ -177,183 +225,46 @@ public:
-template <DeviceType DType1, DeviceType DType2>
-class DepthwiseConvolutionTest {
-  DepthwiseConvolutionTest(const std::string& conv1,
-                           const std::string& conv2,
-                           TestType type,
-                           std::string algo = "auto") {
-    for (size_t batchSize : {1, 32}) {
-      for (size_t inputSize : {7, 14, 54}) {
-        for (size_t filterSize : {1, 3, 5}) {
-          for (size_t inputChannels : {64, 128}) {
-            size_t outputChannels = inputChannels;
-            for (size_t stride : {1, 2}) {
-              for (size_t padding : {0, 1}) {
-                if (padding >= filterSize) break;
-                size_t outputSize =
-                    (inputSize - filterSize + 2 * padding + stride) / stride;
-                VLOG(3) << " batchSize=" << batchSize
-                        << " inputChannels=" << inputChannels
-                        << " inputHeight=" << inputSize
-                        << " inputWidth=" << inputSize
-                        << " outputChannels=" << outputChannels
-                        << " filterHeight=" << filterSize
-                        << " filterWidth=" << filterSize
-                        << " outputHeight=" << outputSize
-                        << " outputWidth=" << outputSize << " stride=" << stride
-                        << " padding=" << padding;
-                std::vector<size_t> paddings = {padding, padding};
-                std::vector<size_t> strides = {stride, stride};
-                size_t groups = inputChannels;
-                Compare2Function<DType1, DType2> test(
-                    conv1,
-                    conv2,
-                    FuncConfig()
-                        .set("paddings", paddings)
-                        .set("strides", strides)
-                        .set("groups", groups)
-                        .set("algo", algo));
-                TensorShape input{
-                    batchSize, inputChannels, inputSize, inputSize};
-                TensorShape filter{inputChannels, 1, 1, filterSize, filterSize};
-                TensorShape output{
-                    batchSize, outputChannels, outputSize, outputSize};
-                if (type == kForwardTest) {
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-                  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
-        ;
-                } else if (type == kBackwardInputTest) {
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-                  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
-        ;
-                } else if (type == kBackwardFilterTest) {
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-        ;
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-// Mainly used to test cases where the height and width (input, filter)
-// are not equal.
-template <DeviceType DType1, DeviceType DType2>
-class DepthwiseConvolutionTest2 {
-  DepthwiseConvolutionTest2(const std::string& conv1,
-                            const std::string& conv2,
-                            TestType type,
-                            std::string algo = "auto") {
-    for (size_t batchSize : {16}) {
-      for (size_t inputHeight : {7, 31}) {
-        for (size_t inputWidth : {10, 54}) {
-          for (size_t filterHeight : {1, 5}) {
-            for (size_t filterWidth : {3, 7}) {
-              for (size_t inputChannels : {32}) {
-                size_t outputChannels = inputChannels;
-                size_t stride = 1;
-                size_t padding = 0;
-                size_t outputHeight =
-                    (inputHeight - filterHeight + 2 * padding + stride) /
-                    stride;
-                size_t outputWidth =
-                    (inputWidth - filterWidth + 2 * padding + stride) / stride;
-                VLOG(3) << " batchSize=" << batchSize
-                        << " inputChannels=" << inputChannels
-                        << " inputHeight=" << inputHeight
-                        << " inputWidth=" << inputWidth
-                        << " outputChannels=" << outputChannels
-                        << " filterHeight=" << filterHeight
-                        << " filterWidth=" << filterWidth
-                        << " outputHeight=" << outputHeight
-                        << " outputWidth=" << outputWidth
-                        << " stride=" << stride << " padding=" << padding;
-                std::vector<size_t> paddings = {padding, padding};
-                std::vector<size_t> strides = {stride, stride};
-                size_t groups = inputChannels;
-                Compare2Function<DType1, DType2> test(
-                    conv1,
-                    conv2,
-                    FuncConfig()
-                        .set("paddings", paddings)
-                        .set("strides", strides)
-                        .set("groups", groups)
-                        .set("algo", algo));
-                TensorShape input{
-                    batchSize, inputChannels, inputHeight, inputWidth};
-                TensorShape filter{
-                    inputChannels, 1, 1, filterHeight, filterWidth};
-                TensorShape output{
-                    batchSize, outputChannels, outputHeight, outputWidth};
-                if (type == kForwardTest) {
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-                  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
-        ;
-                } else if (type == kBackwardInputTest) {
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-                  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
-        ;
-                } else if (type == kBackwardFilterTest) {
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-        ;
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
 // ======Start Convolution TEST======
 TEST(Forward, GEMM) {
   ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
-      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
+      "NaiveConv-CPU", "GemmConv-CPU", convolutionType, kForwardTest);
   ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test2(
-      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
+      "NaiveConv-CPU", "GemmConv-CPU", convolutionType, kForwardTest);
 TEST(Forward, GEMM2) {
   ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConv-CPU", "GemmConv-GPU", kForwardTest);
+      "GemmConv-CPU", "GemmConv-GPU", convolutionType, kForwardTest);
   ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConv-CPU", "GemmConv-GPU", kForwardTest);
+      "GemmConv-CPU", "GemmConv-GPU", convolutionType, kForwardTest);
 TEST(BackwardInput, GEMM) {
   ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
+      "GemmConvGradInput-CPU",
+      "GemmConvGradInput-GPU",
+      convolutionType,
+      kBackwardInputTest);
   ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
+      "GemmConvGradInput-CPU",
+      "GemmConvGradInput-GPU",
+      convolutionType,
+      kBackwardInputTest);
 TEST(BackwardFilter, GEMM) {
   ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
+      "GemmConvGradFilter-CPU",
+      "GemmConvGradFilter-GPU",
+      convolutionType,
+      kBackwardFilterTest);
   ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
+      "GemmConvGradFilter-CPU",
+      "GemmConvGradFilter-GPU",
+      convolutionType,
+      kBackwardFilterTest);
 // ======End Convolution TEST======
@@ -364,38 +275,54 @@ TEST(BackwardFilter, GEMM) {
 TEST(DepthwiseConvForward, GEMM) {
-  DepthwiseConvolutionTest<DEVICE_TYPE_GPU, DEVICE_TYPE_GPU> test(
-      "GemmConv-GPU", "DepthwiseConv-GPU", kForwardTest);
-  DepthwiseConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConv-GPU", "DepthwiseConv-GPU", kForwardTest);
+  ConvolutionTest<DEVICE_TYPE_GPU, DEVICE_TYPE_GPU> test(
+      "GemmConv-GPU",
+      "DepthwiseConv-GPU",
+      depthwiseConvolutionType,
+      kForwardTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      "GemmConv-GPU",
+      "DepthwiseConv-GPU",
+      depthwiseConvolutionType,
+      kForwardTest);
 TEST(DepthwiseConvForward, GEMM2) {
-  DepthwiseConvolutionTest<DEVICE_TYPE_GPU, DEVICE_TYPE_GPU> test(
-      "DepthwiseConv-GPU", "DepthwiseConv-GPU", kForwardTest);
-  DepthwiseConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "DepthwiseConv-GPU", "DepthwiseConv-GPU", kForwardTest);
+  ConvolutionTest<DEVICE_TYPE_GPU, DEVICE_TYPE_GPU> test(
+      "DepthwiseConv-GPU",
+      "DepthwiseConv-GPU",
+      depthwiseConvolutionType,
+      kForwardTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      "DepthwiseConv-GPU",
+      "DepthwiseConv-GPU",
+      depthwiseConvolutionType,
+      kForwardTest);
 TEST(DepthwiseConvBackwardInput, GEMM) {
-  DepthwiseConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+      depthwiseConvolutionType,
-  DepthwiseConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      depthwiseConvolutionType,
 TEST(DepthwiseConvBackwardFilter, GEMM) {
-  DepthwiseConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+      depthwiseConvolutionType,
-  DepthwiseConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      depthwiseConvolutionType,