From 87cba48b69a24294e1e9f83ed750f44454ff45b6 Mon Sep 17 00:00:00 2001
From: limingshu <61349199+JamesLim-sy@users.noreply.github.com>
Date: Wed, 7 Sep 2022 10:34:06 +0800
Subject: [PATCH] Performance fix for broadcast kernel [Part2] (#40051)

* first commit

* merged with develop

* merged with develop

* fix bugs in merging sequential one-value dims
---
 paddle/phi/kernels/funcs/aligned_vector.h     | 20 ++++++-------
 paddle/phi/kernels/funcs/broadcast_function.h | 28 +++++++++----------
 2 files changed, 24 insertions(+), 24 deletions(-)
diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h
index 70f75d5352..c931b90a92 100644
--- a/paddle/phi/kernels/funcs/aligned_vector.h
+++ b/paddle/phi/kernels/funcs/aligned_vector.h
@@ -54,20 +54,20 @@ HOSTDEVICE inline void Store(const AlignedVector<T, Size>& vec, T* addr) {
 template <typename T>
 int GetVectorizedSize(const T* pointer) {
   constexpr int max_load_bits = 128;
-  int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T);
+  constexpr int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T);
   uint64_t address = reinterpret_cast<uint64_t>(pointer);
   constexpr int vec8 = std::alignment_of<AlignedVector<T, 8>>::value;  // NOLINT
   constexpr int vec4 = std::alignment_of<AlignedVector<T, 4>>::value;  // NOLINT
   constexpr int vec2 = std::alignment_of<AlignedVector<T, 2>>::value;  // NOLINT
-  if (address % vec8 == 0) {
-    /*
-     * Currently, decide to deal with no more than 4 data once while adopting
-     * vectorization load/store, if performance test shows that dealing with
-     * 8 data once in vectorization load/store does get optimized, return code
-     * below can be changed into " return std::min(8, valid_vec_size); " .
-     */
-    return std::min(4, valid_vec_size);
-  } else if (address % vec4 == 0) {
+  /*
+   * Currently at most 4 elements are handled per vectorized load/store.
+   * If performance tests show that handling 8 elements at once is faster,
+   * the branch chain below can instead begin with:
+   *   if (address % vec8 == 0) {
+   *     return std::min(8, valid_vec_size);
+   *   } else if (address % vec4 == 0) { ...
+   */
+  if (address % vec4 == 0) {
     return std::min(4, valid_vec_size);
   } else if (address % vec2 == 0) {
     return std::min(2, valid_vec_size);
diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h
index 9b9d9e1d20..40dfb76586 100644
--- a/paddle/phi/kernels/funcs/broadcast_function.h
+++ b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -125,7 +125,7 @@ struct DimensionsTransform {
 
   // To judge whether shape of any input tensors is sequential
   // 1-value-dimensions, and metric the length of it.
-  int GetSequentialOneDimLength(int *swap_index) {
+  bool FindSequentialOneDim(int *swap_index) {
     int index = 0;
     int max_one_length = 0;
     for (int j = 0; j < N; ++j) {
@@ -144,16 +144,16 @@ struct DimensionsTransform {
           }
         }
       }
-      max_one_length =
-          seq_one_length > max_one_length ? seq_one_length : max_one_length;
       index = seq_one_length > max_one_length ? j : index;
+      max_one_length = std::max(seq_one_length, max_one_length);
     }
 
-    if (max_one_length > 1) {
+    bool has_seq_one = max_one_length > 1;
+    if (has_seq_one) {
       std::swap(in_dims[0], in_dims[index]);
       *swap_index = index;
     }
-    return max_one_length;
+    return has_seq_one;
   }
 
  public:
@@ -214,8 +214,8 @@ struct DimensionsTransform {
       }
     };
     int swap_idx = 0;
-    int max_one_length = GetSequentialOneDimLength(&swap_idx);
-    if (max_one_length > 1) {
+    bool has_seq_one = FindSequentialOneDim(&swap_idx);
+    if (has_seq_one) {
       merge_ptr = merge_sequential_one_dims;
       MergeDimensions<MergeFunctor>(merge_ptr, N);
       std::swap(in_dims[swap_idx], in_dims[0]);
@@ -223,13 +223,13 @@ struct DimensionsTransform {
   }
 };
 
-template <typename InT, typename OutT, int NumOuts = 1>
+template <typename InT, typename OutT>
 int GetVecsize(const std::vector<const DenseTensor *> &ins,
                std::vector<DenseTensor *> *outs) {
   int in_vec_size = 4;
   int out_vec_size = 4;
-  if (NumOuts > 1) {
-    for (int i = 0; i < NumOuts; ++i) {
+  if (outs->size() > 1) {
+    for (auto i = 1; i < outs->size(); ++i) {
       PADDLE_ENFORCE_EQ(
           (*outs)[i]->dims(),
           (*outs)[0]->dims(),
@@ -295,7 +295,7 @@ __device__ void VectorizedBroadcastKernelImpl(
   __simd__ ConditionalT<OutT, NumOuts> result[VecSize];
 
 #pragma unroll
-  for (int i = 0; i < Arity; i++) {
+  for (int i = 0; i < Arity; ++i) {
     kps::Init<InT, VecSize>(args[i], static_cast<InT>(1.0f), read_lens);
     LoadData<InT, VecSize, IsBoundary>(args[i],
                                        ins[i],
@@ -433,7 +433,7 @@ void LaunchBroadcastKernel(
     outs_data[i] = (_ptr_ OutT *)(ctx.Alloc<OutT>((*outs)[i]));
   }
 
-  for (int i = 0; i < Arity; i++) {
+  for (int i = 0; i < Arity; ++i) {
     use_broadcast[i] = (ins[i]->numel() != numel);
     ins_data[i] = (const _ptr_ InT *)(ins[i]->data<InT>());
   }
@@ -532,7 +532,7 @@ void BroadcastKernelForDifferentVecSize(
   bool is_optimize = configs[0].cmp_type != type;
   int vec_size = is_optimize ? VecSizeL : VecSizeM;
 #else
-  for (int i = 0; i < kArity; i++) {
+  for (int i = 0; i < kArity; ++i) {
     // get the broadcast config,
     // if data shape is[m, n], then you should set data_dim = {n, m}
     // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3}
@@ -541,7 +541,7 @@ void BroadcastKernelForDifferentVecSize(
           merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size);
     }
   }
-  int vec_size = GetVecsize<InT, OutT, NumOuts>(ins, outs);
+  int vec_size = GetVecsize<InT, OutT>(ins, outs);
 #endif
 
   switch (vec_size) {
-- 
GitLab