Unverified commit 819f8939 authored by limingshu, committed by GitHub

Cache for cublaslt descriptor (#50931)

* first commit

* finish base work

* modification for good

* fix for cache setting and gather the algo and desc as one data for cache storage

* fix for cache setting and gather the algo and desc as one data for cache storage

* install pre-commit check
Parent 25d3ed65
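In outline: the old cache stored a bare cublasLtMatmulAlgo_t per key, so descriptors were still created and destroyed on every call. This patch caches a heap-allocated MatmulDescriptor that bundles all cuBLASLt handles together with the searched algorithm ("gather the algo and desc as one data for cache storage"). A minimal sketch of such a bundle, with illustrative names rather than Paddle's exact definitions:

#include <cublasLt.h>

// Sketch only: simplified stand-in for the patch's MatmulDescriptor.
// All cuBLASLt handles plus the autotuned algorithm travel as one unit,
// so a single cache lookup restores the complete matmul setup.
struct CachedMatmul {
  cublasLtMatmulDesc_t op_desc{nullptr};     // compute type, transposes
  cublasLtMatrixLayout_t x_desc{nullptr};    // input X layout
  cublasLtMatrixLayout_t y_desc{nullptr};    // input Y layout
  cublasLtMatrixLayout_t out_desc{nullptr};  // output layout
  cublasLtMatmulAlgo_t* algo{nullptr};       // search winner, heap-owned
  bool is_cached{false};                     // set once the cache owns a copy
};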
@@ -60,10 +60,6 @@ size_t GenKey(Args&&... args) {
return seed;
}
struct MatmulHashValueType {
uint64_t data[8];
};
struct MatmulCacheKey {
public:
MatmulCacheKey() {}
@@ -79,7 +75,7 @@ struct MatmulCacheKey {
static_cast<int64_t>(dtype));
}
size_t GetKey() const { return key; }
size_t GetSubKey(int64_t idx) const { return GenKey(key, idx); }
size_t GenSubKey(int64_t idx) const { return GenKey(key, idx); }
private:
size_t key;
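The renamed method (GenSubKey, formerly GetSubKey — it computes rather than retrieves) folds a backend tag into the base key. A minimal sketch of the boost-style hash combining presumably behind GenKey, with illustrative constants:

#include <cstdint>
#include <functional>

// Sketch only: fold each attribute (M, N, K, trans flags, dtype, ...)
// into one seed, boost::hash_combine style.
inline void HashCombine(std::size_t* seed, std::size_t value) {
  *seed ^= value + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
}

template <typename... Args>
std::size_t GenKey(Args&&... args) {
  std::size_t seed = 0;
  (HashCombine(&seed,
               std::hash<std::size_t>()(static_cast<std::size_t>(args))),
   ...);
  return seed;
}

// A sub-key scopes the base key to one backend (e.g. kCublasLt == 2), so
// cuBLAS and cuBLASLt entries for the same shape can never collide.
inline std::size_t GenSubKey(std::size_t key, std::int64_t impl) {
  return GenKey(key, static_cast<std::size_t>(impl));
}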
@@ -249,22 +245,22 @@ class MatmulAlgorithmsCache : public AlgorithmsCache<KeyT, AlgorithmT> {
return ret;
}
void SetSubKey(const KeyT& sub_key, const MatmulHashValueType* algo) {
void SetSubKey(const KeyT& sub_key, void* algo) {
std::lock_guard<std::mutex> lock(*(this->cache_mutex_));
sub_hash_[sub_key] = *algo;
sub_hash_[sub_key] = algo;
}
MatmulHashValueType* GetSubKey(const KeyT& sub_key) {
void* GetSubKey(const KeyT& sub_key) {
std::lock_guard<std::mutex> lock(*(this->cache_mutex_));
PADDLE_ENFORCE_NE(
sub_hash_.find(sub_key),
sub_hash_.end(),
phi::errors::PreconditionNotMet("The key does not exist."));
return &(sub_hash_[sub_key]);
return sub_hash_[sub_key];
}
private:
std::unordered_map<KeyT, MatmulHashValueType> sub_hash_;
std::unordered_map<KeyT, void*> sub_hash_;
};
} // namespace autotune
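For reference, a standalone sketch of the type-erased sub-cache this hunk arrives at. Storing void* lets one map hold pointers to entire heap descriptors instead of copying fixed-size algo structs by value, at the price of a reinterpret_cast at the call site; this simplified version throws where Paddle uses PADDLE_ENFORCE_NE:

#include <cstddef>
#include <mutex>
#include <stdexcept>
#include <unordered_map>

class SubKeyCache {
 public:
  bool FindSubKey(std::size_t key) {
    std::lock_guard<std::mutex> lock(mutex_);
    return sub_hash_.count(key) != 0;
  }
  void SetSubKey(std::size_t key, void* value) {
    std::lock_guard<std::mutex> lock(mutex_);
    sub_hash_[key] = value;  // insert or overwrite; no ownership taken here
  }
  void* GetSubKey(std::size_t key) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = sub_hash_.find(key);
    if (it == sub_hash_.end()) {
      throw std::runtime_error("The key does not exist.");
    }
    return it->second;  // caller casts back to the concrete descriptor type
  }

 private:
  std::mutex mutex_;
  std::unordered_map<std::size_t, void*> sub_hash_;
};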
@@ -26,7 +26,7 @@ limitations under the License. */
namespace phi {
namespace funcs {
enum MatmulImplType { kImplWithCublas = 1, kImplWithCublasLt = 2 };
enum MatmulImplType { kCublas = 1, kCublasLt = 2 };
template <typename T>
cublasComputeType_t GetCudaComputeType() {
@@ -43,7 +43,35 @@ struct MatmulDescriptor {
cublasLtMatrixLayout_t x_desc{nullptr};
cublasLtMatrixLayout_t y_desc{nullptr};
cublasLtMatrixLayout_t out_desc{nullptr};
cublasLtMatmulAlgo_t* algo{nullptr};
MatmulDescriptor() {}
MatmulDescriptor(const MatmulDescriptor& obj) {
algo = obj.algo;
x_desc = obj.x_desc;
y_desc = obj.y_desc;
op_desc = obj.op_desc;
out_desc = obj.out_desc;
}
~MatmulDescriptor() {
if (!is_cached) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescDestroy(op_desc));
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(y_desc));
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(x_desc));
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::cublasLtMatrixLayoutDestroy(out_desc));
delete algo;
op_desc = nullptr;
x_desc = nullptr;
y_desc = nullptr;
out_desc = nullptr;
algo = nullptr;
}
}
// op_desc, x_desc, y_desc and out_desc point to heap allocations, so the
// destructor must release them unless ownership has moved to a cached copy.
template <typename T>
void Create(const int M,
const int N,
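The copy constructor and destructor above encode a deliberate ownership split: copies are shallow (they alias the same cuBLASLt handles), and only an object that was never marked cached releases those handles on destruction. ValidateCache() is called on the search-path descriptor right before it is copied into the cache, so the stack original skips teardown while the heap copy, which the cache keeps for the process lifetime, stays valid. A compressed sketch of that rule, with hypothetical names:

// Sketch only: DescriptorOwner and ReleaseHandle are illustrative; the
// patch applies the same rule to four cuBLASLt handles plus the algo.
struct DescriptorOwner {
  void* handle{nullptr};  // stands in for the cuBLASLt handles
  bool is_cached{false};

  DescriptorOwner() = default;
  DescriptorOwner(const DescriptorOwner& other)
      : handle(other.handle) {}  // shallow: both objects alias one handle

  void ValidateCache() { is_cached = true; }

  ~DescriptorOwner() {
    // Skipped for the marked original; the heap copy held by the cache is
    // never deleted, so the shared handle outlives this destructor.
    if (!is_cached) {
      ReleaseHandle(handle);  // cublasLt*Destroy(...) in the real patch
    }
  }

  static void ReleaseHandle(void*) { /* destroy the native handle */ }
};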
@@ -90,6 +118,15 @@ struct MatmulDescriptor {
}
}
cublasLtMatmulAlgo_t* SetAlgo() {
algo = new cublasLtMatmulAlgo_t;
return algo;
}
void ValidateCache() { is_cached = true; }
private:
bool is_cached{false};
void CreateMatrixLayout(cublasLtMatrixLayout_t* desc,
cudaDataType type,
uint64_t rows,
@@ -118,17 +155,73 @@ struct MatmulDescriptor {
&stride,
sizeof(stride)));
}
};
void Release() {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(y_desc));
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(x_desc));
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(out_desc));
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescDestroy(op_desc));
inline std::string GetDescResultString(std::string prefix,
const MatmulDescriptor* desc,
bool has_algo = true) {
std::ostringstream out;
out << prefix << " \n";
#define GET_DESC_DATA_INFO(src) \
do { \
out << "#data " \
<< "= ["; \
int num = sizeof((*src)) / sizeof(src->data[0]); \
for (int i = 0; i < num; ++i) { \
out << src->data[i] << ", "; \
} \
out << "]\n"; \
} while (0);
if (has_algo) {
GET_DESC_DATA_INFO(desc->algo);
}
GET_DESC_DATA_INFO(desc->x_desc);
GET_DESC_DATA_INFO(desc->y_desc);
GET_DESC_DATA_INFO(desc->out_desc);
GET_DESC_DATA_INFO(desc->op_desc);
return out.str();
}
op_desc = nullptr;
x_desc = nullptr;
y_desc = nullptr;
out_desc = nullptr;
template <typename T>
struct DescriptorSetter {
MatmulDescriptor* desc{nullptr};
size_t sub_key{std::numeric_limits<size_t>::min()};
DescriptorSetter(phi::autotune::MatmulCacheKey* matmul_key,
MatmulDescriptor* desc_ptr,
const int M,
const int N,
const int K,
const bool trans_x,
const bool trans_y,
const int batch_size = 1,
int64_t stride_x = 0,
int64_t stride_y = 0,
int64_t stride_out = 0) {
if (matmul_key != nullptr) {
sub_key =
matmul_key->GenSubKey(static_cast<size_t>(MatmulImplType::kCublasLt));
}
auto& matmul_cache = phi::autotune::AutoTuneCache::Instance().GetMatmul();
if (matmul_cache.FindSubKey(sub_key)) {
desc =
reinterpret_cast<MatmulDescriptor*>(matmul_cache.GetSubKey(sub_key));
VLOG(4) << GetDescResultString("[Heap MatmulDescriptor] ", desc);
} else {
desc_ptr->Create<T>(M,
N,
K,
trans_x,
trans_y,
batch_size,
stride_x,
stride_y,
stride_out);
desc = desc_ptr;
VLOG(4) << GetDescResultString("[Stack MatmulDescriptor] ", desc, false);
}
}
};
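Taken together, DescriptorSetter is the cache front end: a hit returns the long-lived heap descriptor with handles and algorithm already attached, skipping every cublasLt*Create call; a miss fills the caller's stack descriptor, which RunImpl may later search, copy to the heap, and cache. A sketch of that branch, reusing the hypothetical CachedMatmul and SubKeyCache types from the sketches above:

#include <cstddef>

CachedMatmul* PickDescriptor(SubKeyCache* cache,
                             std::size_t sub_key,
                             CachedMatmul* stack_desc) {
  if (cache->FindSubKey(sub_key)) {
    // Hit: reuse handles + algo built on an earlier call (the "heap" path).
    return reinterpret_cast<CachedMatmul*>(cache->GetSubKey(sub_key));
  }
  // Miss: the caller populates stack_desc via Create<T>(...) (the "stack"
  // path); if autotuning runs, RunImpl copies it to the heap and caches it.
  return stack_desc;
}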
@@ -148,9 +241,10 @@ struct MatmulWithCublasLt {
const bool trans_y,
phi::autotune::MatmulCacheKey* matmul_key = nullptr) {
MatmulDescriptor desc;
desc.Create<T>(M, N, K, trans_x, trans_y);
RunImpl(ctx, desc, x_data, y_data, out_data, matmul_key);
desc.Release();
auto setter =
DescriptorSetter<T>(matmul_key, &desc, M, N, K, trans_x, trans_y);
RunImpl(
ctx, setter.desc, x_data, y_data, out_data, setter.sub_key, matmul_key);
}
static void RunWithBatch(
@@ -169,10 +263,19 @@ struct MatmulWithCublasLt {
int64_t stride_out,
phi::autotune::MatmulCacheKey* matmul_key = nullptr) {
MatmulDescriptor desc;
desc.Create<T>(
M, N, K, trans_x, trans_y, batch_size, stride_x, stride_y, stride_out);
RunImpl(ctx, desc, x_data, y_data, out_data, matmul_key);
desc.Release();
auto setter = DescriptorSetter<T>(matmul_key,
&desc,
M,
N,
K,
trans_x,
trans_y,
batch_size,
stride_x,
stride_y,
stride_out);
RunImpl(
ctx, setter.desc, x_data, y_data, out_data, setter.sub_key, matmul_key);
}
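From a caller's perspective the only visible change is the optional key: passing a MatmulCacheKey switches the descriptor cache (and, under autotune, the algorithm search) on, while the default nullptr keeps the old create-and-release-per-call behavior. A hypothetical call site; the MatmulCacheKey constructor arguments are elided because they are not shown in this hunk, and the argument order follows the Run overload above:

void LaunchCachedMatmul(const phi::GPUContext& ctx,
                        const float* x, const float* y, float* out,
                        int M, int N, int K) {
  // Key construction elided: it hashes shapes, transpose flags and dtype.
  phi::autotune::MatmulCacheKey key(/* ... */);
  phi::funcs::MatmulWithCublasLt<float>::Run(
      ctx, x, y, out, M, N, K,
      /*trans_x=*/false, /*trans_y=*/false, &key);
}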
static void RunWithBatch(
@@ -211,88 +314,76 @@ struct MatmulWithCublasLt {
}
static void RunImpl(const phi::GPUContext& ctx,
const MatmulDescriptor& desc,
MatmulDescriptor* desc,
const T* x_ptr,
const T* y_ptr,
T* out_ptr,
const size_t sub_key,
phi::autotune::MatmulCacheKey* matmul_key = nullptr) {
MT alpha = static_cast<MT>(1);
MT beta = static_cast<MT>(0);
cublasLtHandle_t cublaslt_handle = ctx.cublaslt_handle();
cublasLtMatmulAlgo_t* best_algo = nullptr;
size_t workspace_size = static_cast<size_t>(4) * 1024 * 1024;
phi::Allocator::AllocationPtr workspace = GetWorkspace(ctx, workspace_size);
if (matmul_key != nullptr) {
auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul();
size_t sub_key = matmul_key->GetSubKey(
static_cast<int64_t>(MatmulImplType::kImplWithCublasLt));
if (cache.FindSubKey(sub_key)) {
best_algo =
reinterpret_cast<cublasLtMatmulAlgo_t*>(cache.GetSubKey(sub_key));
} else if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune()) {
cublasLtMatmulAlgo_t test_algo;
if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune() &&
(!cache.FindSubKey(sub_key))) {
desc->ValidateCache();
SearchBestAlgo(ctx,
cublaslt_handle,
desc.op_desc,
desc.y_desc,
desc.x_desc,
desc.out_desc,
desc,
static_cast<void*>(&alpha),
static_cast<void*>(&beta),
y_ptr,
x_ptr,
out_ptr,
workspace->ptr(),
workspace_size,
&(test_algo));
cache.SetSubKey(
sub_key,
reinterpret_cast<phi::autotune::MatmulHashValueType*>(&test_algo));
best_algo = &test_algo;
workspace_size);
MatmulDescriptor* best_desc = new MatmulDescriptor(*desc);
VLOG(4) << GetDescResultString("[Searched MatmulDescriptor] ",
best_desc);
cache.SetSubKey(sub_key, reinterpret_cast<void*>(best_desc));
}
}
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmul(
cublaslt_handle,
desc.op_desc,
static_cast<void*>(&alpha),
y_ptr,
desc.y_desc,
x_ptr,
desc.x_desc,
static_cast<void*>(&beta),
out_ptr,
desc.out_desc,
out_ptr,
desc.out_desc,
reinterpret_cast<cublasLtMatmulAlgo_t*>(best_algo),
workspace->ptr(),
workspace_size,
ctx.stream()));
VLOG(4) << GetDescResultString("[Impl MatmulDescriptor] ", desc);
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::cublasLtMatmul(cublaslt_handle,
desc->op_desc,
static_cast<void*>(&alpha),
y_ptr,
desc->y_desc,
x_ptr,
desc->x_desc,
static_cast<void*>(&beta),
out_ptr,
desc->out_desc,
out_ptr,
desc->out_desc,
desc->algo,
workspace->ptr(),
workspace_size,
ctx.stream()));
}
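Note that the call above always forwards desc->algo, which stays nullptr until a search result has been cached; per the cuBLASLt documentation, a null algo pointer makes cublasLtMatmul pick an algorithm heuristically on its own, so the cold and warm paths share a single dispatch. A standalone sketch against the raw API (the patch routes the identical call through phi::dynload wrappers and PADDLE_ENFORCE_GPU_SUCCESS):

#include <cublasLt.h>
#include <cuda_runtime.h>

cublasStatus_t DispatchMatmul(cublasLtHandle_t handle,
                              cublasLtMatmulDesc_t op_desc,
                              cublasLtMatrixLayout_t y_desc,
                              cublasLtMatrixLayout_t x_desc,
                              cublasLtMatrixLayout_t out_desc,
                              const void* alpha, const void* beta,
                              const void* y, const void* x, void* out,
                              const cublasLtMatmulAlgo_t* algo,  // may be null
                              void* workspace, size_t workspace_size,
                              cudaStream_t stream) {
  // y is fed as the first operand, mirroring the code above: with
  // column-major cuBLASLt, computing y * x realizes the row-major x * y.
  return cublasLtMatmul(handle, op_desc, alpha,
                        y, y_desc, x, x_desc, beta,
                        out, out_desc,  // C: accumulator layout
                        out, out_desc,  // D: output layout
                        algo, workspace, workspace_size, stream);
}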
static void SearchBestAlgo(const phi::GPUContext& ctx,
const cublasLtHandle_t& lt_handle,
const cublasLtMatmulDesc_t& op_desc,
const cublasLtMatrixLayout_t& y_desc,
const cublasLtMatrixLayout_t& x_desc,
const cublasLtMatrixLayout_t& out_desc,
MatmulDescriptor* desc,
const void* alpha,
const void* beta,
const void* y_data,
const void* x_data,
void* out_data,
void* workspace_ptr,
size_t workspace_size,
cublasLtMatmulAlgo_t* best_algo) {
size_t workspace_size) {
cublasLtMatmulAlgo_t* best_algo = desc->SetAlgo();
const auto& stream = ctx.stream();
int returned_results = 0;
constexpr int requested_algo_count = 10;
cublasLtMatmulPreference_t preference;
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::cublasLtMatmulPreferenceCreate(&preference));
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulPreferenceSetAttribute(
@@ -300,16 +391,15 @@ struct MatmulWithCublasLt {
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&workspace_size,
sizeof(workspace_size)));
std::vector<cublasLtMatmulHeuristicResult_t> heuristic_results(
requested_algo_count);
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::cublasLtMatmulAlgoGetHeuristic(lt_handle,
op_desc,
y_desc,
x_desc,
out_desc,
out_desc,
desc->op_desc,
desc->y_desc,
desc->x_desc,
desc->out_desc,
desc->out_desc,
preference,
requested_algo_count,
heuristic_results.data(),
@@ -317,7 +407,6 @@ struct MatmulWithCublasLt {
PADDLE_ENFORCE_GT(returned_results,
0,
phi::errors::Unavailable("No GEMM algorithm avaliable."));
phi::GpuTimer timer;
int best_algo_idx = -1;
constexpr int repeats = 6;
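The hidden remainder of this loop presumably stops the timer, accumulates the elapsed time per candidate, and keeps the fastest index; the winning cublasLtMatmulAlgo_t is then written into the slot returned by SetAlgo(), so it is cached along with the handles. A self-contained sketch of the same measure-and-pick mechanism using plain CUDA events, which is what phi::GpuTimer wraps:

#include <cfloat>
#include <cuda_runtime.h>

// Runs each candidate `repeats` times on `stream` and returns the index
// with the fastest average; `launch(i)` must enqueue candidate i's matmul.
template <typename LaunchFn>
int PickFastest(const LaunchFn& launch, int candidate_count,
                cudaStream_t stream, int repeats = 6) {
  int best = -1;
  float best_ms = FLT_MAX;
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  for (int i = 0; i < candidate_count; ++i) {
    float total_ms = 0.f;
    for (int r = 0; r < repeats; ++r) {
      cudaEventRecord(start, stream);
      launch(i);
      cudaEventRecord(stop, stream);
      cudaEventSynchronize(stop);
      float ms = 0.f;
      cudaEventElapsedTime(&ms, start, stop);
      total_ms += ms;
    }
    float avg_ms = total_ms / repeats;
    if (avg_ms < best_ms) {
      best_ms = avg_ms;
      best = i;
    }
  }
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return best;
}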
@@ -329,17 +418,17 @@ struct MatmulWithCublasLt {
timer.Start(stream);
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::cublasLtMatmul(lt_handle,
op_desc,
desc->op_desc,
alpha,
y_data,
y_desc,
desc->y_desc,
x_data,
x_desc,
desc->x_desc,
beta,
out_data,
out_desc,
desc->out_desc,
out_data,
out_desc,
desc->out_desc,
&(heuristic_results[algo_idx].algo),
workspace_ptr,
workspace_size,
@@ -360,7 +449,6 @@ struct MatmulWithCublasLt {
}
}
VLOG(4) << "Best_algo_idx in MatmulWithCublaslt is : " << best_algo_idx;
*best_algo = heuristic_results[best_algo_idx].algo;
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::cublasLtMatmulPreferenceDestroy(preference));