From 87cba48b69a24294e1e9f83ed750f44454ff45b6 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Wed, 7 Sep 2022 10:34:06 +0800 Subject: [PATCH] Performance fix for broadcast kernel [Part2] (#40051) * first commit * merged with develop * merged with develop * fix merge sequential one dims bugs --- paddle/phi/kernels/funcs/aligned_vector.h | 20 ++++++------- paddle/phi/kernels/funcs/broadcast_function.h | 28 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h index 70f75d5352..c931b90a92 100644 --- a/paddle/phi/kernels/funcs/aligned_vector.h +++ b/paddle/phi/kernels/funcs/aligned_vector.h @@ -54,20 +54,20 @@ HOSTDEVICE inline void Store(const AlignedVector& vec, T* addr) { template int GetVectorizedSize(const T* pointer) { constexpr int max_load_bits = 128; - int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); + constexpr int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); uint64_t address = reinterpret_cast(pointer); constexpr int vec8 = std::alignment_of>::value; // NOLINT constexpr int vec4 = std::alignment_of>::value; // NOLINT constexpr int vec2 = std::alignment_of>::value; // NOLINT - if (address % vec8 == 0) { - /* - * Currently, decide to deal with no more than 4 data once while adopting - * vectorization load/store, if performance test shows that dealing with - * 8 data once in vectorization load/store does get optimized, return code - * below can be changed into " return std::min(8, valid_vec_size); " . - */ - return std::min(4, valid_vec_size); - } else if (address % vec4 == 0) { + /* + * Currently, decide to deal with no more than 4 data once while adopting + * vectorization load/store, if performance test shows that dealing with + * 8 data once in vectorization load/store does get optimized, code below + * can begin with : + if (address % vec8 == 0) { + return std::min(4, valid_vec_size); + */ + if (address % vec4 == 0) { return std::min(4, valid_vec_size); } else if (address % vec2 == 0) { return std::min(2, valid_vec_size); diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 9b9d9e1d20..40dfb76586 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -125,7 +125,7 @@ struct DimensionsTransform { // To judge whether shape of any input tensors is sequential // 1-value-dimensions, and metric the length of it. - int GetSequentialOneDimLength(int *swap_index) { + bool FindSequentialOneDim(int *swap_index) { int index = 0; int max_one_length = 0; for (int j = 0; j < N; ++j) { @@ -144,16 +144,16 @@ struct DimensionsTransform { } } } - max_one_length = - seq_one_length > max_one_length ? seq_one_length : max_one_length; index = seq_one_length > max_one_length ? j : index; + max_one_length = std::max(seq_one_length, max_one_length); } - if (max_one_length > 1) { + bool has_seq_one = max_one_length > 1; + if (has_seq_one) { std::swap(in_dims[0], in_dims[index]); *swap_index = index; } - return max_one_length; + return has_seq_one; } public: @@ -214,8 +214,8 @@ struct DimensionsTransform { } }; int swap_idx = 0; - int max_one_length = GetSequentialOneDimLength(&swap_idx); - if (max_one_length > 1) { + bool has_seq_one = FindSequentialOneDim(&swap_idx); + if (has_seq_one) { merge_ptr = merge_sequential_one_dims; MergeDimensions(merge_ptr, N); std::swap(in_dims[swap_idx], in_dims[0]); @@ -223,13 +223,13 @@ struct DimensionsTransform { } }; -template +template int GetVecsize(const std::vector &ins, std::vector *outs) { int in_vec_size = 4; int out_vec_size = 4; - if (NumOuts > 1) { - for (int i = 0; i < NumOuts; ++i) { + if (outs->size() > 1) { + for (auto i = 1; i < outs->size(); ++i) { PADDLE_ENFORCE_EQ( (*outs)[i]->dims(), (*outs)[0]->dims(), @@ -295,7 +295,7 @@ __device__ void VectorizedBroadcastKernelImpl( __simd__ ConditionalT result[VecSize]; #pragma unroll - for (int i = 0; i < Arity; i++) { + for (int i = 0; i < Arity; ++i) { kps::Init(args[i], static_cast(1.0f), read_lens); LoadData(args[i], ins[i], @@ -433,7 +433,7 @@ void LaunchBroadcastKernel( outs_data[i] = (_ptr_ OutT *)(ctx.Alloc((*outs)[i])); } - for (int i = 0; i < Arity; i++) { + for (int i = 0; i < Arity; ++i) { use_broadcast[i] = (ins[i]->numel() != numel); ins_data[i] = (const _ptr_ InT *)(ins[i]->data()); } @@ -532,7 +532,7 @@ void BroadcastKernelForDifferentVecSize( bool is_optimize = configs[0].cmp_type != type; int vec_size = is_optimize ? VecSizeL : VecSizeM; #else - for (int i = 0; i < kArity; i++) { + for (int i = 0; i < kArity; ++i) { // get the broadcast config, // if data shape is[m, n], then you should set data_dim = {n, m} // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3} @@ -541,7 +541,7 @@ void BroadcastKernelForDifferentVecSize( merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); } } - int vec_size = GetVecsize(ins, outs); + int vec_size = GetVecsize(ins, outs); #endif switch (vec_size) { -- GitLab