未验证 提交 53d5abe3 编写于 作者: L limingshu 提交者: GitHub

Addition of switch_auto_tune option for transpose op (#43310)

* 2nd part of transpose update

* add switch_auto_tune option.

* add some changes according to CI

* refine the structure of auto_tune_base.

* merge develop changes

* reset the switch_set_range and change unittest of transpose auto-tune

* change the kernel auto-tune logic
上级 fac6a5f0
...@@ -97,10 +97,9 @@ class FMHARef { ...@@ -97,10 +97,9 @@ class FMHARef {
// input shape: [bs, seq_len, 3, num_head, head_dim] // input shape: [bs, seq_len, 3, num_head, head_dim]
// transpose with perm [2, 0, 3, 1, 4], // transpose with perm [2, 0, 3, 1, 4],
// output_shape: [3, bs, num_head, seq_len, head_dim] // output_shape: [3, bs, num_head, seq_len, head_dim]
int ndims = 5;
std::vector<int> perm_1 = {2, 0, 3, 1, 4}; std::vector<int> perm_1 = {2, 0, 3, 1, 4};
TransposeGPUKernelDriver<T>( TransposeGPUKernelDriver<T>(
dev_ctx_, ndims, qkv_input_tensor, perm_1, transpose_2_out_tensor); dev_ctx_, qkv_input_tensor, perm_1, transpose_2_out_tensor);
T* qkv_data = transpose_2_out_tensor->data<T>(); T* qkv_data = transpose_2_out_tensor->data<T>();
T* qk_out_data = qk_out_tensor->data<T>(); T* qk_out_data = qk_out_tensor->data<T>();
T* qktv_out_data = qktv_out_tensor->data<T>(); T* qktv_out_data = qktv_out_tensor->data<T>();
...@@ -255,9 +254,8 @@ class FMHARef { ...@@ -255,9 +254,8 @@ class FMHARef {
// transpose: [0, 2, 1, 3] // transpose: [0, 2, 1, 3]
// output shape: [batch_size, seq_len, num_heads, head_dim] // output shape: [batch_size, seq_len, num_heads, head_dim]
std::vector<int> perm_3 = {0, 2, 1, 3}; std::vector<int> perm_3 = {0, 2, 1, 3};
ndims = 4;
TransposeGPUKernelDriver<T>( TransposeGPUKernelDriver<T>(
dev_ctx_, ndims, *qktv_out_tensor, perm_3, fmha_out_tensor); dev_ctx_, *qktv_out_tensor, perm_3, fmha_out_tensor);
} }
void ComputeBackward(const Tensor& transpose_2_out_tensor, void ComputeBackward(const Tensor& transpose_2_out_tensor,
...@@ -297,10 +295,9 @@ class FMHARef { ...@@ -297,10 +295,9 @@ class FMHARef {
T* qktv_out_grad_data = qktv_out_grad_tensor->data<T>(); T* qktv_out_grad_data = qktv_out_grad_tensor->data<T>();
// transpose bw // transpose bw
int ndims = 4;
std::vector<int> perm_3 = {0, 2, 1, 3}; std::vector<int> perm_3 = {0, 2, 1, 3};
TransposeGPUKernelDriver<T>( TransposeGPUKernelDriver<T>(
dev_ctx_, ndims, fmha_out_grad_tensor, perm_3, qktv_out_grad_tensor); dev_ctx_, fmha_out_grad_tensor, perm_3, qktv_out_grad_tensor);
// recall batchedgemm(nn) fw: softmax_out_data(x) * v_ptr(y) = // recall batchedgemm(nn) fw: softmax_out_data(x) * v_ptr(y) =
// qktv_out_data(out) // qktv_out_data(out)
...@@ -476,13 +473,9 @@ class FMHARef { ...@@ -476,13 +473,9 @@ class FMHARef {
stride_b); stride_b);
// transpose bw // transpose bw
ndims = 5;
std::vector<int> perm_1 = {1, 3, 0, 2, 4}; std::vector<int> perm_1 = {1, 3, 0, 2, 4};
TransposeGPUKernelDriver<T>(dev_ctx_, TransposeGPUKernelDriver<T>(
ndims, dev_ctx_, *transpose_2_out_grad_tensor, perm_1, qkv_input_grad_tensor);
*transpose_2_out_grad_tensor,
perm_1,
qkv_input_grad_tensor);
} }
private: private:
......
...@@ -622,11 +622,10 @@ class FMHAGateRef { ...@@ -622,11 +622,10 @@ class FMHAGateRef {
Tensor* q_transpose_out, Tensor* q_transpose_out,
Tensor* k_transpose_out, Tensor* k_transpose_out,
Tensor* v_transpose_out) { Tensor* v_transpose_out) {
int ndims = 5;
std::vector<int> perm = {0, 1, 3, 2, 4}; std::vector<int> perm = {0, 1, 3, 2, 4};
TransposeGPUKernelDriver<T>(dev_ctx_, ndims, q_out, perm, q_transpose_out); TransposeGPUKernelDriver<T>(dev_ctx_, q_out, perm, q_transpose_out);
TransposeGPUKernelDriver<T>(dev_ctx_, ndims, k_out, perm, k_transpose_out); TransposeGPUKernelDriver<T>(dev_ctx_, k_out, perm, k_transpose_out);
TransposeGPUKernelDriver<T>(dev_ctx_, ndims, v_out, perm, v_transpose_out); TransposeGPUKernelDriver<T>(dev_ctx_, v_out, perm, v_transpose_out);
} }
void ComputeQKVTransposeBackward(const Tensor& q_transpose_out_grad, void ComputeQKVTransposeBackward(const Tensor& q_transpose_out_grad,
...@@ -635,48 +634,41 @@ class FMHAGateRef { ...@@ -635,48 +634,41 @@ class FMHAGateRef {
Tensor* q_out_grad, Tensor* q_out_grad,
Tensor* k_out_grad, Tensor* k_out_grad,
Tensor* v_out_grad) { Tensor* v_out_grad) {
int ndims = 5;
std::vector<int> perm = {0, 1, 3, 2, 4}; std::vector<int> perm = {0, 1, 3, 2, 4};
TransposeGPUKernelDriver<T>( TransposeGPUKernelDriver<T>(
dev_ctx_, ndims, q_transpose_out_grad, perm, q_out_grad); dev_ctx_, q_transpose_out_grad, perm, q_out_grad);
TransposeGPUKernelDriver<T>( TransposeGPUKernelDriver<T>(
dev_ctx_, ndims, k_transpose_out_grad, perm, k_out_grad); dev_ctx_, k_transpose_out_grad, perm, k_out_grad);
TransposeGPUKernelDriver<T>( TransposeGPUKernelDriver<T>(
dev_ctx_, ndims, v_transpose_out_grad, perm, v_out_grad); dev_ctx_, v_transpose_out_grad, perm, v_out_grad);
} }
// [batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] -> // [batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] ->
// [3, batch_size, seq_len_m, num_heads, seq_len_r, head_dim] // [3, batch_size, seq_len_m, num_heads, seq_len_r, head_dim]
void ComputeQKVTransposeForward(const Tensor& qkv_out, void ComputeQKVTransposeForward(const Tensor& qkv_out,
Tensor* qkv_transpose_out) { Tensor* qkv_transpose_out) {
int ndims = 6;
std::vector<int> perm = {3, 0, 1, 4, 2, 5}; std::vector<int> perm = {3, 0, 1, 4, 2, 5};
TransposeGPUKernelDriver<T>( TransposeGPUKernelDriver<T>(dev_ctx_, qkv_out, perm, qkv_transpose_out);
dev_ctx_, ndims, qkv_out, perm, qkv_transpose_out);
} }
void ComputeQKVTransposeBackward(const Tensor& qkv_transpose_out_grad, void ComputeQKVTransposeBackward(const Tensor& qkv_transpose_out_grad,
Tensor* qkv_out_grad) { Tensor* qkv_out_grad) {
int ndims = 6;
std::vector<int> perm = {1, 2, 4, 0, 3, 5}; std::vector<int> perm = {1, 2, 4, 0, 3, 5};
TransposeGPUKernelDriver<T>( TransposeGPUKernelDriver<T>(
dev_ctx_, ndims, qkv_transpose_out_grad, perm, qkv_out_grad); dev_ctx_, qkv_transpose_out_grad, perm, qkv_out_grad);
} }
// [batch_size, seq_len_m, num_head, seq_len_r, c] -> // [batch_size, seq_len_m, num_head, seq_len_r, c] ->
// [batch_size, seq_len_m, seq_len_r, num_head, c] // [batch_size, seq_len_m, seq_len_r, num_head, c]
void ComputeQKTVTransposeForward(const Tensor& qktv_out, Tensor* fmha_out) { void ComputeQKTVTransposeForward(const Tensor& qktv_out, Tensor* fmha_out) {
int ndims = 5;
std::vector<int> perm = {0, 1, 3, 2, 4}; std::vector<int> perm = {0, 1, 3, 2, 4};
TransposeGPUKernelDriver<T>(dev_ctx_, ndims, qktv_out, perm, fmha_out); TransposeGPUKernelDriver<T>(dev_ctx_, qktv_out, perm, fmha_out);
} }
void ComputeQKTVTransposeBackward(const Tensor& fmha_out_grad, void ComputeQKTVTransposeBackward(const Tensor& fmha_out_grad,
Tensor* qktv_out_grad) { Tensor* qktv_out_grad) {
int ndims = 5;
std::vector<int> perm = {0, 1, 3, 2, 4}; std::vector<int> perm = {0, 1, 3, 2, 4};
TransposeGPUKernelDriver<T>( TransposeGPUKernelDriver<T>(dev_ctx_, fmha_out_grad, perm, qktv_out_grad);
dev_ctx_, ndims, fmha_out_grad, perm, qktv_out_grad);
} }
// qk_out = qk_out + nonbatched_bias + src_mask // qk_out = qk_out + nonbatched_bias + src_mask
......
...@@ -22,7 +22,6 @@ limitations under the License. */ ...@@ -22,7 +22,6 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/autotune/auto_tune_base.h" #include "paddle/phi/kernels/autotune/auto_tune_base.h"
#include "paddle/phi/kernels/autotune/cache.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -1155,50 +1154,31 @@ inline void SimplifyThenLaunch(const int rank, ...@@ -1155,50 +1154,31 @@ inline void SimplifyThenLaunch(const int rank,
} }
template <typename T> template <typename T>
size_t GetTransposeKey(const int rank, void TransposeGPUKernelDriver(const phi::GPUContext& ctx,
const Tensor& in,
const std::vector<int32_t>& perm) {
auto in_shape = phi::vectorize(in.dims());
return phi::autotune::GetKey(
in_shape, perm, rank, paddle::experimental::CppTypeToDataType<T>::Type());
}
template <typename T>
void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx,
const int rank,
const Tensor& in, const Tensor& in,
const std::vector<int32_t>& perm, const std::vector<int32_t>& perm,
Tensor* out) { Tensor* out) {
PADDLE_ENFORCE_LT( const int rank = perm.size();
rank, auto ret = TransposeSimple<T>::run(ctx, in, perm, out);
phi::DDim::kMaxRank,
platform::errors::OutOfRange(
"The maximum dimension rank of "
"tensor is expected to be less than %d, but here is %d.",
phi::DDim::kMaxRank,
rank));
auto ret = TransposeSimple<T>::run(dev_ctx, in, perm, out);
if (!ret) { if (!ret) {
auto* tuner = phi::autotune::MakeTransposeTuner<T>( auto* tuner =
SimplifyThenLaunch<phi::GPUContext, T>); phi::autotune::MakeTransposeTuner<T>(TransCompute<phi::GPUContext, T>);
if (!tuner->IsInit()) { tuner->AddCallBack(
tuner->AddCallBack( phi::autotune::MakeCallback<T>(SimplifyThenLaunch<phi::GPUContext, T>));
phi::autotune::MakeCallback<T>(TransCompute<phi::GPUContext, T>));
tuner->Finalize(); size_t key = phi::autotune::TransposeKey(
} phi::vectorize(in.dims()),
perm,
auto key = GetTransposeKey<T>(rank, in, perm); paddle::experimental::CppTypeToDataType<T>::Type());
auto& cache = phi::autotune::AutoTuneCache::Instance().GetTranspose();
if (cache.Find(key)) { tuner->Run(ctx,
auto index = cache.Get(key); phi::autotune::AlgorithmType::kTranspose,
tuner->RunBestKernel(index, rank, dev_ctx, in, out, perm); key,
} else { rank,
// All avaliable kernels have ran while picking the best kernel, so ctx,
// there may be no need for another RunBestKernel. in,
auto index = tuner->PickBestKernel(dev_ctx, rank, dev_ctx, in, out, perm); out,
cache.Set(key, index); perm);
}
} }
} }
......
...@@ -14,12 +14,10 @@ ...@@ -14,12 +14,10 @@
#pragma once #pragma once
#include <mutex>
#include <type_traits> #include <type_traits>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/autotune/gpu_timer.h" #include "paddle/phi/kernels/autotune/gpu_timer.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
namespace phi { namespace phi {
namespace autotune { namespace autotune {
...@@ -51,33 +49,61 @@ class AutoTuneBase { ...@@ -51,33 +49,61 @@ class AutoTuneBase {
public: public:
AutoTuneBase() {} AutoTuneBase() {}
virtual ~AutoTuneBase() {} virtual ~AutoTuneBase() {}
explicit AutoTuneBase(KernelType kernel) { kernels_.push_back(kernel); }
template <typename Type> explicit AutoTuneBase(KernelType kernel) {
void AddCallBack(Type kernel) { kernels_.push_back(/*default=*/kernel);
static_assert(std::is_same<Type, KernelType>::value,
"Type must be the same");
kernels_.push_back(kernel);
} }
template <typename... Args> void AddCallBack(KernelType kernel) {
void RunBestKernel(const int idx, Args&&... args) { if (!is_init_) {
kernels_[idx].Run(args...); std::lock_guard<std::mutex> lock(mutex_);
kernels_.push_back(kernel);
}
} }
template <typename... Args> template <typename Context, typename... Args>
void RunDefaultKernel(Args&&... args) { void Run(const Context& ctx,
kernels_[0].Run(args...); const AlgorithmType& algo,
const size_t key,
Args&&... args) {
PADDLE_ENFORCE_GT(
kernels_.size(),
0,
paddle::platform::errors::InvalidArgument(
"kernel num must be greater than 0, now is %d", kernels_.size()));
is_init_ = true;
auto& cache = AutoTuneCache::Instance().Get(algo);
if (cache.Find(key)) {
auto best_idx = cache.Get(key);
kernels_[best_idx].Run(args...);
} else {
bool use_autotune = AutoTuneStatus::Instance().UseAutoTune();
if (use_autotune) {
// All available kernels have run while picking the best kernel,
// so there may be no need for another kernel run.
auto best_idx = PickBestKernel(ctx, args...);
cache.Set(key, best_idx);
} else {
kernels_[0].Run(args...);
}
}
} }
private:
bool is_init_{false};
std::vector<KernelType> kernels_;
mutable std::mutex mutex_;
template <typename Context, typename... Args> template <typename Context, typename... Args>
int PickBestKernel(const Context& ctx, Args&&... args) { size_t PickBestKernel(const Context& ctx, Args&&... args) {
std::lock_guard<std::mutex> lock(mutex_);
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
kernels_.size(), kernels_.size(),
0, 0,
paddle::platform::errors::InvalidArgument( paddle::platform::errors::InvalidArgument(
"kernel num must be greater than 0, now is %d", kernels_.size())); "kernel num must be greater than 0, now is %d", kernels_.size()));
int best_idx = 0; size_t best_idx = 0;
float min_time = std::numeric_limits<float>::max(); float min_time = std::numeric_limits<float>::max();
// Time cost test established in default stream. // Time cost test established in default stream.
...@@ -92,23 +118,15 @@ class AutoTuneBase { ...@@ -92,23 +118,15 @@ class AutoTuneBase {
return best_idx; return best_idx;
} }
bool IsInit() { return is_init_; }
void Finalize() { is_init_ = true; }
private:
bool is_init_{false};
std::vector<KernelType> kernels_;
template <typename Context, typename... Args> template <typename Context, typename... Args>
float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) { float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) {
// Regard 1st run as warmup. Judge the result by the time cost of rest run
// cycles.
constexpr int repeats = 3;
phi::GpuTimer timer; phi::GpuTimer timer;
float time_cost = 0; float time_cost = 0;
const auto& stream = ctx.stream(); const auto& stream = ctx.stream();
// Treat 1st run as warm up. Judge the result with
// the sum of 2nd and 3rd run.
constexpr int repeats = 3;
ctx.Wait(); ctx.Wait();
for (int i = 0; i < repeats; ++i) { for (int i = 0; i < repeats; ++i) {
timer.Start(stream); timer.Start(stream);
...@@ -151,7 +169,7 @@ std::once_flag TransposeAutoTuner<T, KernelType>::init_flag_; ...@@ -151,7 +169,7 @@ std::once_flag TransposeAutoTuner<T, KernelType>::init_flag_;
template <typename T, typename RetureType, typename... Args> template <typename T, typename RetureType, typename... Args>
static AutoTuneBase<T, KernelCallback<T, RetureType, Args...>>* static AutoTuneBase<T, KernelCallback<T, RetureType, Args...>>*
MakeTransposeTuner(RetureType (*func)(Args...)) { MakeTransposeTuner(RetureType (*func)(Args...)) {
auto obj = MakeCallback<T>(func); auto obj = MakeCallback<T>(func);
return TransposeAutoTuner<T, decltype(obj)>::Instance(obj); return TransposeAutoTuner<T, decltype(obj)>::Instance(obj);
} }
......
...@@ -131,24 +131,5 @@ TEST(AutoTune, sum) { ...@@ -131,24 +131,5 @@ TEST(AutoTune, sum) {
timer.Stop(0); timer.Stop(0);
VLOG(3) << "kernel[" << i << "]: time cost is " << timer.ElapsedTime(); VLOG(3) << "kernel[" << i << "]: time cost is " << timer.ElapsedTime();
} }
// 2. Test call_back tune.
VLOG(3) << ">>> [AutoTune]: Test case.";
auto tuner = tune::MakeAutoTuner<float>(Algo<4>);
tuner.AddCallBack(tune::MakeCallback<float>(Algo<2>));
tuner.AddCallBack(tune::MakeCallback<float>(Algo<1>));
/* The 1st ctx works for ctx.Wait(),
the 2nd is just the param of call_back. */
auto best_index = tuner.PickBestKernel(
*dev_ctx, *dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks);
dev_ctx->Wait();
phi::GpuTimer timer;
timer.Start(0);
tuner.RunBestKernel(
best_index, *dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks);
timer.Stop(0);
VLOG(3) << "Best CallBackKernel time cost is " << timer.ElapsedTime();
#endif #endif
} }
...@@ -36,6 +36,13 @@ size_t ConvKey(const std::vector<int64_t>& x_dims, ...@@ -36,6 +36,13 @@ size_t ConvKey(const std::vector<int64_t>& x_dims,
static_cast<int64_t>(dtype)); static_cast<int64_t>(dtype));
} }
// Computes the auto-tune cache key of a transpose configuration by hashing
// the input dims, the permutation, the permutation length (used as the rank)
// and the integer value of the data type through GetKey.
size_t TransposeKey(const std::vector<int64_t>& x_dims,
                    const std::vector<int32_t>& perm,
                    phi::DataType dtype) {
  // The rank is taken from the permutation itself, so callers do not have to
  // pass it separately.
  const auto perm_rank = perm.size();
  return GetKey(x_dims, perm, perm_rank, static_cast<int64_t>(dtype));
}
std::string AlgorithmTypeString(int64_t algo_type) { std::string AlgorithmTypeString(int64_t algo_type) {
if (algo_type == static_cast<int64_t>(AlgorithmType::kConvForward)) { if (algo_type == static_cast<int64_t>(AlgorithmType::kConvForward)) {
return "conv_forward"; return "conv_forward";
......
...@@ -68,6 +68,10 @@ size_t ConvKey(const std::vector<int64_t>& x_dims, ...@@ -68,6 +68,10 @@ size_t ConvKey(const std::vector<int64_t>& x_dims,
const std::vector<int>& dilations, const std::vector<int>& dilations,
phi::DataType dtype); phi::DataType dtype);
// Returns the key under which a transpose configuration is stored in the
// auto-tune cache. The key is derived from the input dims `x_dims`, the axis
// permutation `perm` (whose size is used as the rank) and the data type
// `dtype`.
size_t TransposeKey(const std::vector<int64_t>& x_dims,
const std::vector<int32_t>& perm,
phi::DataType dtype);
template <typename AlgorithmT> template <typename AlgorithmT>
class AlgorithmsCache { class AlgorithmsCache {
public: public:
......
...@@ -29,6 +29,7 @@ void AutoTuneStatus::EnableAutoTune() { ...@@ -29,6 +29,7 @@ void AutoTuneStatus::EnableAutoTune() {
void AutoTuneStatus::DisableAutoTune() { void AutoTuneStatus::DisableAutoTune() {
FLAGS_use_autotune = false; FLAGS_use_autotune = false;
use_autotune_ = false;
Init(); Init();
} }
......
...@@ -31,12 +31,11 @@ void TransposeKernel(const Context& ctx, ...@@ -31,12 +31,11 @@ void TransposeKernel(const Context& ctx,
const DenseTensor& x, const DenseTensor& x,
const std::vector<int>& axis, const std::vector<int>& axis,
DenseTensor* out) { DenseTensor* out) {
int rank = axis.size();
ctx.template Alloc<T>(out); ctx.template Alloc<T>(out);
if (out->numel() == 0) { if (out->numel() == 0) {
return; return;
} }
paddle::operators::TransposeGPUKernelDriver<T>(ctx, rank, x, axis, out); paddle::operators::TransposeGPUKernelDriver<T>(ctx, x, axis, out);
} }
} // namespace phi } // namespace phi
......
...@@ -126,6 +126,41 @@ class TestCase9(TestTransposeOp): ...@@ -126,6 +126,41 @@ class TestCase9(TestTransposeOp):
self.axis = (6, 1, 3, 5, 0, 2, 4, 7) self.axis = (6, 1, 3, 5, 0, 2, 4, 7)
class TestAutoTuneTransposeOp(OpTest):
    """Checks transpose2 output and gradient with kernel auto-tune enabled."""

    def setUp(self):
        self.init_op_type()
        # initTestCase also switches auto-tune on, so every test method of
        # this class runs with it enabled.
        self.initTestCase()
        self.python_api = paddle.transpose
        self.inputs = {'X': np.random.random(self.shape).astype("float64")}
        self.attrs = {
            'axis': list(self.axis),
            'use_mkldnn': self.use_mkldnn,
        }
        self.outputs = {
            # XShape is listed in no_check_set below, so its value is a
            # placeholder only.
            'XShape': np.random.random(self.shape).astype("float64"),
            'Out': self.inputs['X'].transpose(self.axis)
        }

    def tearDown(self):
        # setUp enables auto-tune for every test method. Switch it off again
        # after each one (also when a check fails) so the setting does not
        # carry over into tests that run afterwards.
        fluid.core.disable_autotune()
        super().tearDown()

    def initTestCase(self):
        # Configure the auto-tune step range, refresh the status and enable
        # tuning before the op is executed.
        fluid.core.set_autotune_range(0, 3)
        fluid.core.update_autotune_status()
        fluid.core.enable_autotune()
        self.shape = (1, 12, 256, 1)
        self.axis = (0, 3, 2, 1)

    def init_op_type(self):
        self.op_type = "transpose2"
        self.use_mkldnn = False

    def test_check_output(self):
        self.check_output(no_check_set=['XShape'], check_eager=True)

    def test_check_grad(self):
        self.check_grad(['X'], 'Out', check_eager=True)
class TestTransposeBF16Op(OpTest): class TestTransposeBF16Op(OpTest):
def setUp(self): def setUp(self):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册