Unverified commit 53d5abe3, authored by limingshu, committed by GitHub

Addition of switch_auto_tune option for transpose op (#43310)

* 2nd part of transpose update

* add switch_auto_tune option.

* add some changes according to CI

* refine the structure of auto_tune_base.

* merge develop changes

* reset the switch_set_range and change the unit test of transpose auto-tune

* change the kernel auto-tune logic
Parent fac6a5f0
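The core API change in this commit: TransposeGPUKernelDriver no longer takes an explicit ndims/rank argument, because the rank is implied by the length of the permutation. A condensed before/after view of one call site (taken from the first FMHARef hunk below; dev_ctx_, qkv_input_tensor and transpose_2_out_tensor belong to that class, so this is an excerpt rather than a standalone program):

    // Before: callers computed and passed the tensor rank themselves.
    //   int ndims = 5;
    //   TransposeGPUKernelDriver<T>(
    //       dev_ctx_, ndims, qkv_input_tensor, perm_1, transpose_2_out_tensor);
    // After: the driver derives rank = perm.size() internally.
    std::vector<int> perm_1 = {2, 0, 3, 1, 4};
    TransposeGPUKernelDriver<T>(
        dev_ctx_, qkv_input_tensor, perm_1, transpose_2_out_tensor);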
@@ -97,10 +97,9 @@ class FMHARef {
// input shape: [bs, seq_len, 3, num_head, head_dim]
// transpose with perm [2, 0, 3, 1, 4],
// output_shape: [3, bs, num_head, seq_len, head_dim]
- int ndims = 5;
std::vector<int> perm_1 = {2, 0, 3, 1, 4};
TransposeGPUKernelDriver<T>(
- dev_ctx_, ndims, qkv_input_tensor, perm_1, transpose_2_out_tensor);
+ dev_ctx_, qkv_input_tensor, perm_1, transpose_2_out_tensor);
T* qkv_data = transpose_2_out_tensor->data<T>();
T* qk_out_data = qk_out_tensor->data<T>();
T* qktv_out_data = qktv_out_tensor->data<T>();
@@ -255,9 +254,8 @@ class FMHARef {
// transpose: [0, 2, 1, 3]
// output shape: [batch_size, seq_len, num_heads, head_dim]
std::vector<int> perm_3 = {0, 2, 1, 3};
- ndims = 4;
TransposeGPUKernelDriver<T>(
- dev_ctx_, ndims, *qktv_out_tensor, perm_3, fmha_out_tensor);
+ dev_ctx_, *qktv_out_tensor, perm_3, fmha_out_tensor);
}
void ComputeBackward(const Tensor& transpose_2_out_tensor,
@@ -297,10 +295,9 @@ class FMHARef {
T* qktv_out_grad_data = qktv_out_grad_tensor->data<T>();
// transpose bw
- int ndims = 4;
std::vector<int> perm_3 = {0, 2, 1, 3};
TransposeGPUKernelDriver<T>(
- dev_ctx_, ndims, fmha_out_grad_tensor, perm_3, qktv_out_grad_tensor);
+ dev_ctx_, fmha_out_grad_tensor, perm_3, qktv_out_grad_tensor);
// recall batchedgemm(nn) fw: softmax_out_data(x) * v_ptr(y) =
// qktv_out_data(out)
@@ -476,13 +473,9 @@ class FMHARef {
stride_b);
// transpose bw
- ndims = 5;
std::vector<int> perm_1 = {1, 3, 0, 2, 4};
- TransposeGPUKernelDriver<T>(dev_ctx_,
- ndims,
- *transpose_2_out_grad_tensor,
- perm_1,
- qkv_input_grad_tensor);
+ TransposeGPUKernelDriver<T>(
+ dev_ctx_, *transpose_2_out_grad_tensor, perm_1, qkv_input_grad_tensor);
}
private:
......
@@ -622,11 +622,10 @@ class FMHAGateRef {
Tensor* q_transpose_out,
Tensor* k_transpose_out,
Tensor* v_transpose_out) {
- int ndims = 5;
std::vector<int> perm = {0, 1, 3, 2, 4};
- TransposeGPUKernelDriver<T>(dev_ctx_, ndims, q_out, perm, q_transpose_out);
- TransposeGPUKernelDriver<T>(dev_ctx_, ndims, k_out, perm, k_transpose_out);
- TransposeGPUKernelDriver<T>(dev_ctx_, ndims, v_out, perm, v_transpose_out);
+ TransposeGPUKernelDriver<T>(dev_ctx_, q_out, perm, q_transpose_out);
+ TransposeGPUKernelDriver<T>(dev_ctx_, k_out, perm, k_transpose_out);
+ TransposeGPUKernelDriver<T>(dev_ctx_, v_out, perm, v_transpose_out);
}
void ComputeQKVTransposeBackward(const Tensor& q_transpose_out_grad,
@@ -635,48 +634,41 @@ class FMHAGateRef {
Tensor* q_out_grad,
Tensor* k_out_grad,
Tensor* v_out_grad) {
- int ndims = 5;
std::vector<int> perm = {0, 1, 3, 2, 4};
TransposeGPUKernelDriver<T>(
- dev_ctx_, ndims, q_transpose_out_grad, perm, q_out_grad);
+ dev_ctx_, q_transpose_out_grad, perm, q_out_grad);
TransposeGPUKernelDriver<T>(
- dev_ctx_, ndims, k_transpose_out_grad, perm, k_out_grad);
+ dev_ctx_, k_transpose_out_grad, perm, k_out_grad);
TransposeGPUKernelDriver<T>(
- dev_ctx_, ndims, v_transpose_out_grad, perm, v_out_grad);
+ dev_ctx_, v_transpose_out_grad, perm, v_out_grad);
}
// [batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] ->
// [3, batch_size, seq_len_m, num_heads, seq_len_r, head_dim]
void ComputeQKVTransposeForward(const Tensor& qkv_out,
Tensor* qkv_transpose_out) {
- int ndims = 6;
std::vector<int> perm = {3, 0, 1, 4, 2, 5};
- TransposeGPUKernelDriver<T>(
- dev_ctx_, ndims, qkv_out, perm, qkv_transpose_out);
+ TransposeGPUKernelDriver<T>(dev_ctx_, qkv_out, perm, qkv_transpose_out);
}
void ComputeQKVTransposeBackward(const Tensor& qkv_transpose_out_grad,
Tensor* qkv_out_grad) {
- int ndims = 6;
std::vector<int> perm = {1, 2, 4, 0, 3, 5};
TransposeGPUKernelDriver<T>(
- dev_ctx_, ndims, qkv_transpose_out_grad, perm, qkv_out_grad);
+ dev_ctx_, qkv_transpose_out_grad, perm, qkv_out_grad);
}
// [batch_size, seq_len_m, num_head, seq_len_r, c] ->
// [batch_size, seq_len_m, seq_len_r, num_head, c]
void ComputeQKTVTransposeForward(const Tensor& qktv_out, Tensor* fmha_out) {
- int ndims = 5;
std::vector<int> perm = {0, 1, 3, 2, 4};
- TransposeGPUKernelDriver<T>(dev_ctx_, ndims, qktv_out, perm, fmha_out);
+ TransposeGPUKernelDriver<T>(dev_ctx_, qktv_out, perm, fmha_out);
}
void ComputeQKTVTransposeBackward(const Tensor& fmha_out_grad,
Tensor* qktv_out_grad) {
- int ndims = 5;
std::vector<int> perm = {0, 1, 3, 2, 4};
- TransposeGPUKernelDriver<T>(
- dev_ctx_, ndims, fmha_out_grad, perm, qktv_out_grad);
+ TransposeGPUKernelDriver<T>(dev_ctx_, fmha_out_grad, perm, qktv_out_grad);
}
// qk_out = qk_out + nonbatched_bias + src_mask
......
@@ -22,7 +22,6 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/autotune/auto_tune_base.h"
#include "paddle/phi/kernels/autotune/cache.h"
namespace paddle {
namespace operators {
@@ -1155,50 +1154,31 @@ inline void SimplifyThenLaunch(const int rank,
}
- template <typename T>
- size_t GetTransposeKey(const int rank,
- const Tensor& in,
- const std::vector<int32_t>& perm) {
- auto in_shape = phi::vectorize(in.dims());
- return phi::autotune::GetKey(
- in_shape, perm, rank, paddle::experimental::CppTypeToDataType<T>::Type());
- }
template <typename T>
- void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx,
- const int rank,
+ void TransposeGPUKernelDriver(const phi::GPUContext& ctx,
const Tensor& in,
const std::vector<int32_t>& perm,
Tensor* out) {
- PADDLE_ENFORCE_LT(
- rank,
- phi::DDim::kMaxRank,
- platform::errors::OutOfRange(
- "The maximum dimension rank of "
- "tensor is expected to be less than %d, but here is %d.",
- phi::DDim::kMaxRank,
- rank));
- auto ret = TransposeSimple<T>::run(dev_ctx, in, perm, out);
+ const int rank = perm.size();
+ auto ret = TransposeSimple<T>::run(ctx, in, perm, out);
if (!ret) {
- auto* tuner = phi::autotune::MakeTransposeTuner<T>(
- SimplifyThenLaunch<phi::GPUContext, T>);
- if (!tuner->IsInit()) {
+ auto* tuner =
+ phi::autotune::MakeTransposeTuner<T>(TransCompute<phi::GPUContext, T>);
tuner->AddCallBack(
- phi::autotune::MakeCallback<T>(TransCompute<phi::GPUContext, T>));
- tuner->Finalize();
- }
+ phi::autotune::MakeCallback<T>(SimplifyThenLaunch<phi::GPUContext, T>));
- auto key = GetTransposeKey<T>(rank, in, perm);
- auto& cache = phi::autotune::AutoTuneCache::Instance().GetTranspose();
- if (cache.Find(key)) {
- auto index = cache.Get(key);
- tuner->RunBestKernel(index, rank, dev_ctx, in, out, perm);
- } else {
- // All avaliable kernels have ran while picking the best kernel, so
- // there may be no need for another RunBestKernel.
- auto index = tuner->PickBestKernel(dev_ctx, rank, dev_ctx, in, out, perm);
- cache.Set(key, index);
- }
+ size_t key = phi::autotune::TransposeKey(
+ phi::vectorize(in.dims()),
+ perm,
+ paddle::experimental::CppTypeToDataType<T>::Type());
+ tuner->Run(ctx,
+ phi::autotune::AlgorithmType::kTranspose,
+ key,
+ rank,
+ ctx,
+ in,
+ out,
+ perm);
}
}
......
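The rewritten driver above delegates kernel choice to the tuner. As a reading aid, here is a minimal self-contained sketch of that control flow in plain C++ (RunTranspose, Kernel and the cache map are illustrative stand-ins, not Paddle APIs): derive the rank from perm.size(), look up a per-shape key in a cache, and let the auto-tune switch decide between benchmarking the registered kernels and running the default one.

    #include <cstddef>
    #include <functional>
    #include <unordered_map>
    #include <vector>

    using Kernel = std::function<void()>;  // stand-in for one transpose implementation

    void RunTranspose(const std::vector<int>& perm,
                      const std::vector<Kernel>& kernels,
                      std::unordered_map<std::size_t, std::size_t>* cache,
                      std::size_t key,
                      bool use_autotune) {
      const int rank = static_cast<int>(perm.size());  // was an explicit argument before
      (void)rank;  // in Paddle it is forwarded to the kernel callbacks
      auto it = cache->find(key);
      if (it != cache->end()) {
        kernels[it->second]();  // cache hit: reuse the previously chosen kernel
      } else if (use_autotune) {
        std::size_t best = 0;   // Paddle's PickBestKernel times every candidate here
        kernels[best]();
        (*cache)[key] = best;   // remember the winner for this shape/perm/dtype key
      } else {
        kernels[0]();           // autotune switched off: always run the default kernel
      }
    }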
@@ -14,12 +14,10 @@
#pragma once
#include <mutex>
- #include <type_traits>
#include "glog/logging.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/autotune/gpu_timer.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
namespace phi {
namespace autotune {
@@ -51,33 +49,61 @@ class AutoTuneBase {
public:
AutoTuneBase() {}
virtual ~AutoTuneBase() {}
- explicit AutoTuneBase(KernelType kernel) { kernels_.push_back(kernel); }
- template <typename Type>
- void AddCallBack(Type kernel) {
- static_assert(std::is_same<Type, KernelType>::value,
- "Type must be the same");
- kernels_.push_back(kernel);
+ explicit AutoTuneBase(KernelType kernel) {
+ kernels_.push_back(/*default=*/kernel);
}
- template <typename... Args>
- void RunBestKernel(const int idx, Args&&... args) {
- kernels_[idx].Run(args...);
+ void AddCallBack(KernelType kernel) {
+ if (!is_init_) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ kernels_.push_back(kernel);
+ }
}
- template <typename... Args>
- void RunDefaultKernel(Args&&... args) {
+ template <typename Context, typename... Args>
+ void Run(const Context& ctx,
+ const AlgorithmType& algo,
+ const size_t key,
+ Args&&... args) {
+ PADDLE_ENFORCE_GT(
+ kernels_.size(),
+ 0,
+ paddle::platform::errors::InvalidArgument(
+ "kernel num must be greater than 0, now is %d", kernels_.size()));
+ is_init_ = true;
+ auto& cache = AutoTuneCache::Instance().Get(algo);
+ if (cache.Find(key)) {
+ auto best_idx = cache.Get(key);
+ kernels_[best_idx].Run(args...);
+ } else {
+ bool use_autotune = AutoTuneStatus::Instance().UseAutoTune();
+ if (use_autotune) {
+ // All avaliable kernels have ran while picking the best kernel,
+ // so there may be no need for another kernel run.
+ auto best_idx = PickBestKernel(ctx, args...);
+ cache.Set(key, best_idx);
+ } else {
+ kernels_[0].Run(args...);
+ }
+ }
+ }
+ private:
+ bool is_init_{false};
+ std::vector<KernelType> kernels_;
+ mutable std::mutex mutex_;
template <typename Context, typename... Args>
- int PickBestKernel(const Context& ctx, Args&&... args) {
+ size_t PickBestKernel(const Context& ctx, Args&&... args) {
+ std::lock_guard<std::mutex> lock(mutex_);
PADDLE_ENFORCE_GT(
kernels_.size(),
0,
paddle::platform::errors::InvalidArgument(
"kernel num must be greater than 0, now is %d", kernels_.size()));
- int best_idx = 0;
+ size_t best_idx = 0;
float min_time = std::numeric_limits<float>::max();
// Time cost test estabulished in default stream.
@@ -92,23 +118,15 @@ class AutoTuneBase {
return best_idx;
}
- bool IsInit() { return is_init_; }
- void Finalize() { is_init_ = true; }
- private:
- bool is_init_{false};
- std::vector<KernelType> kernels_;
template <typename Context, typename... Args>
float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) {
- // Regard 1st run as warmup. Judge the result by the time cost of rest run
- // cycles.
- constexpr int repeats = 3;
phi::GpuTimer timer;
float time_cost = 0;
const auto& stream = ctx.stream();
+ // Treat 1st run as warm up. Judge the result with
+ // the sum of 2nd and 3rd run.
+ constexpr int repeats = 3;
ctx.Wait();
for (int i = 0; i < repeats; ++i) {
timer.Start(stream);
@@ -151,7 +169,7 @@ std::once_flag TransposeAutoTuner<T, KernelType>::init_flag_;
template <typename T, typename RetureType, typename... Args>
static AutoTuneBase<T, KernelCallback<T, RetureType, Args...>>*
- MakeTransposeTuner(RetureType (*func)(Args...)) {
+ MakeTransposeTuner(RetureType (*func)(Args...)) {
auto obj = MakeCallback<T>(func);
return TransposeAutoTuner<T, decltype(obj)>::Instance(obj);
}
......
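A behavioral detail of the refactored AutoTuneBase worth noting: kernels can only be registered before the first Run(), since Run() sets is_init_ and AddCallBack() is a no-op once is_init_ is true. A usage sketch under that reading (KernelA/KernelB/KernelC, ctx, key and the trailing arguments are placeholders, not names from this patch):

    auto* tuner = phi::autotune::MakeTransposeTuner<float>(KernelA);   // default kernel
    tuner->AddCallBack(phi::autotune::MakeCallback<float>(KernelB));   // registered
    tuner->Run(ctx, phi::autotune::AlgorithmType::kTranspose, key,
               rank, ctx, in, out, perm);                              // sets is_init_
    tuner->AddCallBack(phi::autotune::MakeCallback<float>(KernelC));   // ignored after Run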
@@ -131,24 +131,5 @@ TEST(AutoTune, sum) {
timer.Stop(0);
VLOG(3) << "kernel[" << i << "]: time cost is " << timer.ElapsedTime();
}
- // 2. Test call_back tune.
- VLOG(3) << ">>> [AutoTune]: Test case.";
- auto tuner = tune::MakeAutoTuner<float>(Algo<4>);
- tuner.AddCallBack(tune::MakeCallback<float>(Algo<2>));
- tuner.AddCallBack(tune::MakeCallback<float>(Algo<1>));
- /* The 1st ctx works for ctx.Wait(),
- the 2nd is just the param of call_back. */
- auto best_index = tuner.PickBestKernel(
- *dev_ctx, *dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks);
- dev_ctx->Wait();
- phi::GpuTimer timer;
- timer.Start(0);
- tuner.RunBestKernel(
- best_index, *dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks);
- timer.Stop(0);
- VLOG(3) << "Best CallBackKernel time cost is " << timer.ElapsedTime();
#endif
}
@@ -36,6 +36,13 @@ size_t ConvKey(const std::vector<int64_t>& x_dims,
static_cast<int64_t>(dtype));
}
+ size_t TransposeKey(const std::vector<int64_t>& x_dims,
+ const std::vector<int32_t>& perm,
+ phi::DataType dtype) {
+ const auto rank = perm.size();
+ return GetKey(x_dims, perm, rank, static_cast<int64_t>(dtype));
+ }
std::string AlgorithmTypeString(int64_t algo_type) {
if (algo_type == static_cast<int64_t>(AlgorithmType::kConvForward)) {
return "conv_forward";
......
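The new TransposeKey folds the permutation, its length (the rank) and the dtype into the hash, so the same shape transposed along different axes is cached and tuned separately. An illustrative call (the dims/perm values are arbitrary examples):

    std::vector<int64_t> dims = {1, 12, 256, 1};
    size_t key_a = phi::autotune::TransposeKey(dims, {0, 3, 2, 1}, phi::DataType::FLOAT32);
    size_t key_b = phi::autotune::TransposeKey(dims, {0, 2, 1, 3}, phi::DataType::FLOAT32);
    // key_a != key_b (barring hash collisions), so each permutation gets its own
    // cache entry and its own tuning decision.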
@@ -68,6 +68,10 @@ size_t ConvKey(const std::vector<int64_t>& x_dims,
const std::vector<int>& dilations,
phi::DataType dtype);
+ size_t TransposeKey(const std::vector<int64_t>& x_dims,
+ const std::vector<int32_t>& perm,
+ phi::DataType dtype);
template <typename AlgorithmT>
class AlgorithmsCache {
public:
......
@@ -29,6 +29,7 @@ void AutoTuneStatus::EnableAutoTune() {
void AutoTuneStatus::DisableAutoTune() {
FLAGS_use_autotune = false;
use_autotune_ = false;
+ Init();
}
......
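For reference, the switch consulted by AutoTuneBase::Run() is the AutoTuneStatus singleton touched here. A minimal sketch of driving it from C++, using only the members visible in this patch (how the tuning step range feeds UseAutoTune() is not shown):

    auto& status = phi::autotune::AutoTuneStatus::Instance();
    status.EnableAutoTune();             // turn the global switch on
    bool tuning = status.UseAutoTune();  // the flag AutoTuneBase::Run() checks
    status.DisableAutoTune();            // now also calls Init() to reset the state
    (void)tuning;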
@@ -31,12 +31,11 @@ void TransposeKernel(const Context& ctx,
const DenseTensor& x,
const std::vector<int>& axis,
DenseTensor* out) {
- int rank = axis.size();
ctx.template Alloc<T>(out);
if (out->numel() == 0) {
return;
}
- paddle::operators::TransposeGPUKernelDriver<T>(ctx, rank, x, axis, out);
+ paddle::operators::TransposeGPUKernelDriver<T>(ctx, x, axis, out);
}
} // namespace phi
......
@@ -126,6 +126,41 @@ class TestCase9(TestTransposeOp):
self.axis = (6, 1, 3, 5, 0, 2, 4, 7)
+ class TestAutoTuneTransposeOp(OpTest):
+     def setUp(self):
+         self.init_op_type()
+         self.initTestCase()
+         self.python_api = paddle.transpose
+         self.inputs = {'X': np.random.random(self.shape).astype("float64")}
+         self.attrs = {
+             'axis': list(self.axis),
+             'use_mkldnn': self.use_mkldnn,
+         }
+         self.outputs = {
+             'XShape': np.random.random(self.shape).astype("float64"),
+             'Out': self.inputs['X'].transpose(self.axis)
+         }
+     def initTestCase(self):
+         fluid.core.set_autotune_range(0, 3)
+         fluid.core.update_autotune_status()
+         fluid.core.enable_autotune()
+         self.shape = (1, 12, 256, 1)
+         self.axis = (0, 3, 2, 1)
+     def init_op_type(self):
+         self.op_type = "transpose2"
+         self.use_mkldnn = False
+     def test_check_output(self):
+         self.check_output(no_check_set=['XShape'], check_eager=True)
+         fluid.core.disable_autotune()
+     def test_check_grad(self):
+         self.check_grad(['X'], 'Out', check_eager=True)
class TestTransposeBF16Op(OpTest):
def setUp(self):
......