Commit 87099d12 authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_slice_to_pten

......@@ -157,7 +157,7 @@ ConvActivationFusePass::ConvActivationFusePass() {
       // IsStringIn({"NHWC", "NCHW"}) MobileNetV2 does not have this attribute
       .AddAttr("data_format")
       .IsOptional()
-      .IsStringIn({"NCHW", "AnyLayout"})
+      .IsStringIn({"NCHW", "NHWC", "AnyLayout"})
       .End();
   AddOpCompat(OpCompat("relu"))
......
......@@ -115,7 +115,7 @@ Conv2DTransposeBiasFusePass::Conv2DTransposeBiasFusePass() {
       .IsStringIn({"EXPLICIT", "SAME", "VALID"})
       .End()
       .AddAttr("data_format")
-      .IsStringIn({"NCHW"})
+      .IsStringIn({"NCHW", "NHWC", "AnyLayout"})
       .End();
   AddOpCompat(OpCompat("elementwise_add"))
......@@ -129,7 +129,7 @@ Conv2DTransposeBiasFusePass::Conv2DTransposeBiasFusePass() {
       .IsTensor()
       .End()
       .AddAttr("axis")
-      .IsIntIn({1})
+      .IsIntIn({1, 3})
       .End();
 }
......
......@@ -59,7 +59,7 @@ ConvConcatReLUFusePass::ConvConcatReLUFusePass() {
       .IsType<std::vector<int>>()
       .End()
       .AddAttr("data_format")
-      .IsStringIn({"NCHW"})
+      .IsStringIn({"NCHW", "NHWC", "AnyLayout"})
       .End();
   AddOpCompat(OpCompat("concat"))
......
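Taken together, the hunks above widen the layout constraints that gate these MKLDNN fuse passes: graphs whose conv ops carry data_format NHWC or AnyLayout (and, for the elementwise_add following a transposed conv, axis 3 as well as 1, since NHWC places the channel dimension last) can now be matched and fused instead of rejected. For orientation, the OpCompat builder pattern every hunk extends looks like this sketch (surrounding pass-constructor code omitted; only the attribute these hunks touch is shown):

  // Declares, inside a fuse pass constructor, which attribute values
  // the pass may safely match; IsStringIn() enumerates the accepted
  // layouts, and the change above widens that set to include NHWC.
  AddOpCompat(OpCompat("conv2d"))
      .AddAttr("data_format")
      .IsOptional()  // the attribute may be absent (e.g. MobileNetV2)
      .IsStringIn({"NCHW", "NHWC", "AnyLayout"})
      .End();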
......@@ -28,6 +28,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/pten/core/hostdevice.h"
+#if defined(__NVCC__) || defined(__HIPCC__)
+#include "paddle/pten/kernels/primitive/kernel_primitives.h"
+#endif
 #if !defined(_WIN32)
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
 #else
......@@ -91,6 +95,8 @@ struct normal_transform {
 #if defined(__NVCC__) || defined(__HIPCC__)
+namespace kps = pten::kps;
 /*********************** Distribution Function *************************/
 template <typename T>
 struct uniform_distribution;
......@@ -176,25 +182,26 @@ template <typename T, typename DistOp, typename TransformOp>
 __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
                                    DistOp dist, TransformOp trans,
                                    T *out_data) {
-  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
-  int32_t returns_count = DistOp::kReturnsCount;
+  size_t idx = static_cast<size_t>(BLOCK_ID_X * BLOCK_NUM_X);
+  static constexpr int kCount = DistOp::kReturnsCount;
 #if defined(__NVCC__)
   curandStatePhilox4_32_10_t state;
-  curand_init(seed, idx, offset, &state);
+  curand_init(seed, idx + THREAD_ID_X, offset, &state);
   using SType = curandStatePhilox4_32_10_t;
 #else
   hiprandStatePhilox4_32_10_t state;
-  hiprand_init(seed, idx, offset, &state);
+  hiprand_init(seed, idx + THREAD_ID_X, offset, &state);
   using SType = hiprandStatePhilox4_32_10_t;
 #endif
-  size_t total_thread = gridDim.x * blockDim.x;
-  for (size_t i = idx; i < size; i += total_thread * returns_count) {
-    auto random_tuple = dist(&state);
-    for (size_t j = 0; j < returns_count; j++) {
-      size_t index = i + j * total_thread;
-      if (index < size) {
-        auto random = (&random_tuple.x)[j];
-        out_data[index] = static_cast<T>(trans(random));
-      }
-    }
+  size_t total_thread = GRID_NUM_X * BLOCK_NUM_X;
+  T args[kCount];
+  T result[kCount];
+  for (size_t i = idx; i < size; i += total_thread * kCount) {
+    kps::ElementwiseRandom<SType, T, kCount, 1, DistOp>(&args[0], dist, &state);
+    kps::ElementwiseUnary<T, T, kCount, 1, 1, TransformOp>(&result[0], &args[0],
+                                                           trans);
+    kps::WriteData<T, T, kCount, 1, 1, true>(out_data + i, &result[0], size - i,
+                                             1, total_thread, 1);
   }
 }
......
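The rewrite above replaces the hand-rolled tuple-unpacking loop with the pten kernel primitives: kps::ElementwiseRandom fills a per-thread register buffer with kCount values from the Philox state, kps::ElementwiseUnary applies the distribution transform, and kps::WriteData performs the bounds-checked strided store, while BLOCK_ID_X / BLOCK_NUM_X / THREAD_ID_X / GRID_NUM_X abstract over the CUDA and HIP index intrinsics. The underlying access pattern can be sketched without the wrappers. This is a minimal self-contained CUDA sketch, not Paddle's code: the kernel name is illustrative, kCount is pinned to the four floats one Philox call yields, and it keeps the old per-thread indexing rather than WriteData's block-offset form.

#include <cstdint>
#include <curand_kernel.h>

__global__ void UniformSketch(size_t size, uint64_t seed, uint64_t offset,
                              float* out) {
  constexpr int kCount = 4;  // curand_uniform4 yields 4 floats per call
  size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  curandStatePhilox4_32_10_t state;
  curand_init(seed, tid, offset, &state);  // one subsequence per thread
  size_t total_thread = static_cast<size_t>(gridDim.x) * blockDim.x;
  for (size_t i = tid; i < size; i += total_thread * kCount) {
    float4 r = curand_uniform4(&state);
#pragma unroll
    for (int j = 0; j < kCount; ++j) {
      // Stride by total_thread so consecutive threads write
      // consecutive addresses (coalesced stores).
      size_t index = i + static_cast<size_t>(j) * total_thread;
      if (index < size) out[index] = (&r.x)[j];
    }
  }
}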
......@@ -428,5 +428,58 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
   }
 }
+
+template <typename StateType,
+          typename OutT,
+          int ReturnsCount,
+          int BlockSize,
+          class OpFunc>
+__device__ __forceinline__ void ElementwiseRandom(OutT* out,
+                                                  OpFunc compute,
+                                                  StateType* state) {
+  auto random_tuple = compute(state);
+#pragma unroll
+  for (int i = 0; i < ReturnsCount; i++) {
+    out[i] = static_cast<OutT>((&random_tuple.x)[i]);
+  }
+}
+
+// Note: blockDim.x must equal shared_size; in and out are register
+// pointers (each thread contributes two elements to the scan).
+#define shared_size 64
+template <typename InT,
+          typename OutT,
+          int NX,
+          int NY,
+          int BlockSize,
+          class OpFunc>
+__device__ __forceinline__ void Cumsum(OutT* out,
+                                       const InT* in,
+                                       OpFunc compute) {
+  // Offsetting every index by index / 32 pads shared memory so the
+  // tree accesses do not all land in the same bank.
+  __shared__ InT temp[shared_size * 2 + (shared_size * 2) / 32];
+  int tidx = threadIdx.x;
+  temp[tidx + tidx / 32] = in[0];
+  temp[shared_size + tidx + (shared_size + tidx) / 32] = in[1];
+  // Up-sweep: build a reduction tree of partial sums in place.
+  for (int stride = 1; stride <= blockDim.x; stride *= 2) {
+    __syncthreads();
+    int index = (tidx + 1) * 2 * stride - 1;
+    if (index < (blockDim.x * 2)) {
+      temp[index + index / 32] += temp[index - stride + (index - stride) / 32];
+    }
+  }
+  // Down-sweep: propagate the partial sums to complete the prefix sums.
+  for (int stride = (blockDim.x * 2) / 4; stride > 0; stride /= 2) {
+    __syncthreads();
+    int index = (tidx + 1) * 2 * stride - 1;
+    if ((index + stride) < (blockDim.x * 2)) {
+      temp[index + stride + (stride + index) / 32] +=
+          temp[index + index / 32];
+    }
+  }
+  __syncthreads();
+  out[0] = static_cast<OutT>(temp[tidx + tidx / 32]);
+  out[1] =
+      static_cast<OutT>(temp[tidx + shared_size + (tidx + shared_size) / 32]);
+}
 }  // namespace kps
 }  // namespace pten
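The Cumsum added above is a work-efficient block-level scan in the Blelloch style: the up-sweep builds a reduction tree over the 2 * blockDim.x values a block holds (two per thread), the down-sweep propagates the partial sums, and the index / 32 padding spreads the tree accesses across shared-memory banks. What a block ends up computing appears to reduce to an ordinary inclusive prefix sum, so a plain CPU loop makes a handy reference when checking it (a sketch; the function name is illustrative):

#include <cstddef>
#include <vector>

// Inclusive prefix sum: out[i] = in[0] + in[1] + ... + in[i].
std::vector<float> InclusiveScanReference(const std::vector<float>& in) {
  std::vector<float> out(in.size());
  float running = 0.0f;
  for (std::size_t i = 0; i < in.size(); ++i) {
    running += in[i];
    out[i] = running;
  }
  return out;
}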
......@@ -53,8 +53,6 @@ class TestConvActMkldnnFusePass(PassAutoScanTest):
         data_format = prog_config.ops[0].attrs["data_format"]
         filter_shape = prog_config.weights["filter"].shape
         input_shape = prog_config.inputs["input_x"].shape
-        if data_format != "NCHW":
-            return False
         if padding_algorithm == "VALID":
             if ((input_shape[2] - (dilations[0] * (filter_shape[2] - 1) + 1)) / strides[0] + 1) <= 1 or \
                ((input_shape[3] - (dilations[1] * (filter_shape[3] - 1) + 1)) / strides[1] + 1) <= 1:
......@@ -80,8 +78,8 @@ class TestConvActMkldnnFusePass(PassAutoScanTest):
         x_shape = draw(
             st.lists(
                 st.integers(
-                    min_value=1, max_value=100), min_size=4, max_size=4))
-        x_shape[1] = draw(st.integers(min_value=1, max_value=10))
+                    min_value=5, max_value=100), min_size=4, max_size=4))
+        x_shape[1] = draw(st.integers(min_value=5, max_value=10))
         # 2. Generate legal attr:data_format of conv2d
         data_format = draw(st.sampled_from(["NCHW", "NHWC"]))
......@@ -90,7 +88,7 @@ class TestConvActMkldnnFusePass(PassAutoScanTest):
         f_shape = draw(
             st.lists(
                 st.integers(
-                    min_value=1, max_value=7), min_size=4, max_size=4))
+                    min_value=1, max_value=5), min_size=4, max_size=4))
         if data_format == "NCHW":
             f_shape[1] = x_shape[1]
         else:
......
......@@ -53,8 +53,6 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest):
         data_format = prog_config.ops[0].attrs["data_format"]
         filter_shape = prog_config.weights["filter"].shape
         input_shape = prog_config.inputs["input_x"].shape
-        if data_format != "NCHW":
-            return False
         if padding_algorithm == "VALID":
             if ((input_shape[2] - (dilations[0] * (filter_shape[2] - 1) + 1)) / strides[0] + 1) <= 1 or \
                ((input_shape[3] - (dilations[1] * (filter_shape[3] - 1) + 1)) / strides[1] + 1) <= 1:
......@@ -80,8 +78,8 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest):
         x_shape = draw(
             st.lists(
                 st.integers(
-                    min_value=1, max_value=100), min_size=4, max_size=4))
-        x_shape[1] = draw(st.integers(min_value=1, max_value=10))
+                    min_value=5, max_value=100), min_size=4, max_size=4))
+        x_shape[1] = draw(st.integers(min_value=5, max_value=10))
         # 2. Generate legal attr:data_format of conv2d
         data_format = draw(st.sampled_from(["NCHW", "NHWC"]))
......@@ -90,7 +88,7 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest):
         f_shape = draw(
             st.lists(
                 st.integers(
-                    min_value=1, max_value=7), min_size=4, max_size=4))
+                    min_value=1, max_value=4), min_size=4, max_size=4))
         if data_format == "NCHW":
             f_shape[1] = x_shape[1]
         else:
......@@ -100,7 +98,7 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest):
         strides = draw(
             st.lists(
                 st.integers(
-                    min_value=1, max_value=5), min_size=2, max_size=2))
+                    min_value=1, max_value=4), min_size=2, max_size=2))
         # 5. Generate legal attr:padding_algorithm of conv2d
         padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"]))
......@@ -109,7 +107,7 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest):
         padding = draw(
             st.lists(
                 st.integers(
-                    min_value=1, max_value=5), min_size=4, max_size=4))
+                    min_value=1, max_value=4), min_size=4, max_size=4))
         # 7. Generate legal attr:groups of conv2d
         groups = draw(st.integers(min_value=1, max_value=3))
......@@ -118,7 +116,7 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest):
         dilations = draw(
             st.lists(
                 st.integers(
-                    min_value=1, max_value=5), min_size=2, max_size=2))
+                    min_value=1, max_value=4), min_size=2, max_size=2))
         # 9. Generate legal shape of input:bias of elementwise_add
         bias_shape = [f_shape[0]]
......
......@@ -27,15 +27,6 @@ import hypothesis.strategies as st
 class TestConvConcatReluMkldnnFusePass(PassAutoScanTest):
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
-        attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
-        ]
-        # If the problem has been fixed, the judgment
-        # needs to be deleted!!!
-        if attrs[0]['data_format'] == "NHWC":
-            return False
         return True

     def sample_program_config(self, draw):
......
......@@ -27,15 +27,6 @@ import hypothesis.strategies as st
 class TestConvGeluMkldnnFusePass(PassAutoScanTest):
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
-        attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
-        ]
-        # If the problem has been fixed, the judgment
-        # needs to be deleted!!!
-        if attrs[0]['data_format'] == "NHWC":
-            return False
         return True

     def sample_program_config(self, draw):
......@@ -108,19 +99,6 @@ class TestConvGeluMkldnnFusePass(PassAutoScanTest):
         config = self.create_inference_config(use_mkldnn=True)
         yield config, ["conv2d"], (1e-5, 1e-5)

-    # If the problem has been fixed, the judgment
-    # needs to be deleted!!!
-    def add_ignore_pass_case(self):
-        def teller1(program_config, predictor_config):
-            if program_config.ops[0].attrs['data_format'] == "NHWC":
-                return True
-            return False
-
-        self.add_ignore_check_case(
-            teller1, SkipReasons.PASS_ACCURACY_ERROR,
-            "The output format of conv2d is wrong when data_format attribute is NHWC"
-        )

     def test(self):
         self.run_and_statis(quant=False, passes=["conv_gelu_mkldnn_fuse_pass"])
......
......@@ -27,15 +27,6 @@ import hypothesis.strategies as st
 class TestConvHardSigmoidMkldnnFusePass(PassAutoScanTest):
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
-        attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
-        ]
-        # If the problem has been fixed, the judgment
-        # needs to be deleted!!!
-        if attrs[0]['data_format'] == "NHWC":
-            return False
         return True

     def sample_program_config(self, draw):
......
......@@ -27,15 +27,6 @@ import hypothesis.strategies as st
 class TestConvHardSwishMkldnnFusePass(PassAutoScanTest):
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
-        attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
-        ]
-        # If the problem has been fixed, the judgment
-        # needs to be deleted!!!
-        if attrs[0]['data_format'] == "NHWC":
-            return False
         return True

     def sample_program_config(self, draw):
......
......@@ -32,9 +32,9 @@ class TestConvTransposeMkldnnFusePass(PassAutoScanTest):
             for i in range(len(program_config.ops))
         ]
-        # If the problem has been fixed, the judgment
-        # needs to be deleted!!!
-        if attrs[0]['data_format'] == "NHWC":
+        if attrs[0]['data_format'] == "NCHW" and attrs[1]["axis"] == 3:
+            return False
+        if attrs[0]['data_format'] == "NHWC" and attrs[1]["axis"] == 1:
             return False
         return True
......@@ -46,7 +46,7 @@ class TestConvTransposeMkldnnFusePass(PassAutoScanTest):
         groups = draw(st.sampled_from([1, 2, 4, 8]))
         paddings = draw(st.sampled_from([[0, 3], [1, 2, 3, 4]]))
         strides = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]]))
-        axis = draw(st.sampled_from([1]))
+        axis = draw(st.sampled_from([1, 3]))
         batch_size = draw(st.integers(min_value=1, max_value=4))

         def generate_input():
......@@ -110,7 +110,9 @@ class TestConvTransposeMkldnnFusePass(PassAutoScanTest):
     def test(self):
         self.run_and_statis(
-            quant=False, passes=["conv_transpose_bias_mkldnn_fuse_pass"])
+            quant=False,
+            max_duration=300,
+            passes=["conv_transpose_bias_mkldnn_fuse_pass"])

 if __name__ == "__main__":
......