未验证 提交 819f8939 编写于 作者: L limingshu 提交者: GitHub

Cache for cublaslt descriptor (#50931)

* first commit

* finish base work

* modification for good

* fix for cache setting and gather the algo and desc as one data for cache storage

* fix for cache setting and gather the algo and desc as one data for cache storage

* install pre-commit check
上级 25d3ed65
...@@ -60,10 +60,6 @@ size_t GenKey(Args&&... args) { ...@@ -60,10 +60,6 @@ size_t GenKey(Args&&... args) {
return seed; return seed;
} }
struct MatmulHashValueType {
uint64_t data[8];
};
struct MatmulCacheKey { struct MatmulCacheKey {
public: public:
MatmulCacheKey() {} MatmulCacheKey() {}
...@@ -79,7 +75,7 @@ struct MatmulCacheKey { ...@@ -79,7 +75,7 @@ struct MatmulCacheKey {
static_cast<int64_t>(dtype)); static_cast<int64_t>(dtype));
} }
size_t GetKey() const { return key; } size_t GetKey() const { return key; }
size_t GetSubKey(int64_t idx) const { return GenKey(key, idx); } size_t GenSubKey(int64_t idx) const { return GenKey(key, idx); }
private: private:
size_t key; size_t key;
...@@ -249,22 +245,22 @@ class MatmulAlgorithmsCache : public AlgorithmsCache<KeyT, AlgorithmT> { ...@@ -249,22 +245,22 @@ class MatmulAlgorithmsCache : public AlgorithmsCache<KeyT, AlgorithmT> {
return ret; return ret;
} }
void SetSubKey(const KeyT& sub_key, const MatmulHashValueType* algo) { void SetSubKey(const KeyT& sub_key, void* algo) {
std::lock_guard<std::mutex> lock(*(this->cache_mutex_)); std::lock_guard<std::mutex> lock(*(this->cache_mutex_));
sub_hash_[sub_key] = *algo; sub_hash_[sub_key] = algo;
} }
MatmulHashValueType* GetSubKey(const KeyT& sub_key) { void* GetSubKey(const KeyT& sub_key) {
std::lock_guard<std::mutex> lock(*(this->cache_mutex_)); std::lock_guard<std::mutex> lock(*(this->cache_mutex_));
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
sub_hash_.find(sub_key), sub_hash_.find(sub_key),
sub_hash_.end(), sub_hash_.end(),
phi::errors::PreconditionNotMet("The key does not exist.")); phi::errors::PreconditionNotMet("The key does not exist."));
return &(sub_hash_[sub_key]); return sub_hash_[sub_key];
} }
private: private:
std::unordered_map<KeyT, MatmulHashValueType> sub_hash_; std::unordered_map<KeyT, void*> sub_hash_;
}; };
} // namespace autotune } // namespace autotune
......
...@@ -26,7 +26,7 @@ limitations under the License. */ ...@@ -26,7 +26,7 @@ limitations under the License. */
namespace phi { namespace phi {
namespace funcs { namespace funcs {
enum MatmulImplType { kImplWithCublas = 1, kImplWithCublasLt = 2 }; enum MatmulImplType { kCublas = 1, kCublasLt = 2 };
template <typename T> template <typename T>
cublasComputeType_t GetCudaComputeType() { cublasComputeType_t GetCudaComputeType() {
...@@ -43,7 +43,35 @@ struct MatmulDescriptor { ...@@ -43,7 +43,35 @@ struct MatmulDescriptor {
cublasLtMatrixLayout_t x_desc{nullptr}; cublasLtMatrixLayout_t x_desc{nullptr};
cublasLtMatrixLayout_t y_desc{nullptr}; cublasLtMatrixLayout_t y_desc{nullptr};
cublasLtMatrixLayout_t out_desc{nullptr}; cublasLtMatrixLayout_t out_desc{nullptr};
cublasLtMatmulAlgo_t* algo{nullptr};
MatmulDescriptor() {}
MatmulDescriptor(const MatmulDescriptor& obj) {
algo = obj.algo;
x_desc = obj.x_desc;
y_desc = obj.y_desc;
op_desc = obj.op_desc;
out_desc = obj.out_desc;
}
~MatmulDescriptor() {
if (!is_cached) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescDestroy(op_desc));
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(y_desc));
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(x_desc));
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::cublasLtMatrixLayoutDestroy(out_desc));
delete algo;
op_desc = nullptr;
x_desc = nullptr;
y_desc = nullptr;
out_desc = nullptr;
algo = nullptr;
}
}
// x_desc, y_desc, op_desc are allocated in heap memory.
template <typename T> template <typename T>
void Create(const int M, void Create(const int M,
const int N, const int N,
...@@ -90,6 +118,15 @@ struct MatmulDescriptor { ...@@ -90,6 +118,15 @@ struct MatmulDescriptor {
} }
} }
cublasLtMatmulAlgo_t* SetAlgo() {
algo = new cublasLtMatmulAlgo_t;
return algo;
}
void ValidateCache() { is_cached = true; }
private:
bool is_cached{false};
void CreateMatrixLayout(cublasLtMatrixLayout_t* desc, void CreateMatrixLayout(cublasLtMatrixLayout_t* desc,
cudaDataType type, cudaDataType type,
uint64_t rows, uint64_t rows,
...@@ -118,17 +155,73 @@ struct MatmulDescriptor { ...@@ -118,17 +155,73 @@ struct MatmulDescriptor {
&stride, &stride,
sizeof(stride))); sizeof(stride)));
} }
};
void Release() { inline std::string GetDescResultString(std::string prefix,
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(y_desc)); const MatmulDescriptor* desc,
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(x_desc)); bool has_algo = true) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(out_desc)); std::ostringstream out;
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescDestroy(op_desc)); out << prefix << " \n";
#define GET_DESC_DATA_INFO(src) \
do { \
out << "#data " \
<< "= ["; \
int num = sizeof((*src)) / sizeof(src->data[0]); \
for (int i = 0; i < num; ++i) { \
out << src->data[i] << ", "; \
} \
out << "]\n"; \
} while (0);
if (has_algo) {
GET_DESC_DATA_INFO(desc->algo);
}
GET_DESC_DATA_INFO(desc->x_desc);
GET_DESC_DATA_INFO(desc->y_desc);
GET_DESC_DATA_INFO(desc->out_desc);
GET_DESC_DATA_INFO(desc->op_desc);
return out.str();
}
op_desc = nullptr; template <typename T>
x_desc = nullptr; struct DescriptorSetter {
y_desc = nullptr; MatmulDescriptor* desc{nullptr};
out_desc = nullptr; size_t sub_key{std::numeric_limits<size_t>::min()};
DescriptorSetter(phi::autotune::MatmulCacheKey* matmul_key,
MatmulDescriptor* desc_ptr,
const int M,
const int N,
const int K,
const bool trans_x,
const bool trans_y,
const int batch_size = 1,
int64_t stride_x = 0,
int64_t stride_y = 0,
int64_t stride_out = 0) {
if (matmul_key != nullptr) {
sub_key =
matmul_key->GenSubKey(static_cast<size_t>(MatmulImplType::kCublasLt));
}
auto& mamtul_cache = phi::autotune::AutoTuneCache::Instance().GetMatmul();
if (mamtul_cache.FindSubKey(sub_key)) {
desc =
reinterpret_cast<MatmulDescriptor*>(mamtul_cache.GetSubKey(sub_key));
VLOG(4) << GetDescResultString("[Heap MatmulDescriptor] ", desc);
} else {
desc_ptr->Create<T>(M,
N,
K,
trans_x,
trans_y,
batch_size,
stride_x,
stride_y,
stride_out);
desc = desc_ptr;
VLOG(4) << GetDescResultString("[Stack MatmulDescriptor] ", desc, false);
}
} }
}; };
...@@ -148,9 +241,10 @@ struct MatmulWithCublasLt { ...@@ -148,9 +241,10 @@ struct MatmulWithCublasLt {
const bool trans_y, const bool trans_y,
phi::autotune::MatmulCacheKey* matmul_key = nullptr) { phi::autotune::MatmulCacheKey* matmul_key = nullptr) {
MatmulDescriptor desc; MatmulDescriptor desc;
desc.Create<T>(M, N, K, trans_x, trans_y); auto setter =
RunImpl(ctx, desc, x_data, y_data, out_data, matmul_key); DescriptorSetter<T>(matmul_key, &desc, M, N, K, trans_x, trans_y);
desc.Release(); RunImpl(
ctx, setter.desc, x_data, y_data, out_data, setter.sub_key, matmul_key);
} }
static void RunWithBatch( static void RunWithBatch(
...@@ -169,10 +263,19 @@ struct MatmulWithCublasLt { ...@@ -169,10 +263,19 @@ struct MatmulWithCublasLt {
int64_t stride_out, int64_t stride_out,
phi::autotune::MatmulCacheKey* matmul_key = nullptr) { phi::autotune::MatmulCacheKey* matmul_key = nullptr) {
MatmulDescriptor desc; MatmulDescriptor desc;
desc.Create<T>( auto setter = DescriptorSetter<T>(matmul_key,
M, N, K, trans_x, trans_y, batch_size, stride_x, stride_y, stride_out); &desc,
RunImpl(ctx, desc, x_data, y_data, out_data, matmul_key); M,
desc.Release(); N,
K,
trans_x,
trans_y,
batch_size,
stride_x,
stride_y,
stride_out);
RunImpl(
ctx, setter.desc, x_data, y_data, out_data, setter.sub_key, matmul_key);
} }
static void RunWithBatch( static void RunWithBatch(
...@@ -211,88 +314,76 @@ struct MatmulWithCublasLt { ...@@ -211,88 +314,76 @@ struct MatmulWithCublasLt {
} }
static void RunImpl(const phi::GPUContext& ctx, static void RunImpl(const phi::GPUContext& ctx,
const MatmulDescriptor& desc, MatmulDescriptor* desc,
const T* x_ptr, const T* x_ptr,
const T* y_ptr, const T* y_ptr,
T* out_ptr, T* out_ptr,
const size_t sub_key,
phi::autotune::MatmulCacheKey* matmul_key = nullptr) { phi::autotune::MatmulCacheKey* matmul_key = nullptr) {
MT alpha = static_cast<MT>(1); MT alpha = static_cast<MT>(1);
MT beta = static_cast<MT>(0); MT beta = static_cast<MT>(0);
cublasLtHandle_t cublaslt_handle = ctx.cublaslt_handle(); cublasLtHandle_t cublaslt_handle = ctx.cublaslt_handle();
cublasLtMatmulAlgo_t* best_algo = nullptr;
size_t workspace_size = static_cast<size_t>(4) * 1024 * 1024; size_t workspace_size = static_cast<size_t>(4) * 1024 * 1024;
phi::Allocator::AllocationPtr workspace = GetWorkspace(ctx, workspace_size); phi::Allocator::AllocationPtr workspace = GetWorkspace(ctx, workspace_size);
if (matmul_key != nullptr) { if (matmul_key != nullptr) {
auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul();
size_t sub_key = matmul_key->GetSubKey( if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune() &&
static_cast<int64_t>(MatmulImplType::kImplWithCublasLt)); (!cache.FindSubKey(sub_key))) {
if (cache.FindSubKey(sub_key)) { desc->ValidateCache();
best_algo =
reinterpret_cast<cublasLtMatmulAlgo_t*>(cache.GetSubKey(sub_key));
} else if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune()) {
cublasLtMatmulAlgo_t test_algo;
SearchBestAlgo(ctx, SearchBestAlgo(ctx,
cublaslt_handle, cublaslt_handle,
desc.op_desc, desc,
desc.y_desc,
desc.x_desc,
desc.out_desc,
static_cast<void*>(&alpha), static_cast<void*>(&alpha),
static_cast<void*>(&beta), static_cast<void*>(&beta),
y_ptr, y_ptr,
x_ptr, x_ptr,
out_ptr, out_ptr,
workspace->ptr(), workspace->ptr(),
workspace_size, workspace_size);
&(test_algo)); MatmulDescriptor* best_desc = new MatmulDescriptor(*desc);
cache.SetSubKey( VLOG(4) << GetDescResultString("[Searched MatmulDescriptor] ",
sub_key, best_desc);
reinterpret_cast<phi::autotune::MatmulHashValueType*>(&test_algo)); cache.SetSubKey(sub_key, reinterpret_cast<void*>(best_desc));
best_algo = &test_algo;
} }
} }
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmul( VLOG(4) << GetDescResultString("[Impl MatmulDescriptor] ", desc);
cublaslt_handle, PADDLE_ENFORCE_GPU_SUCCESS(
desc.op_desc, dynload::cublasLtMatmul(cublaslt_handle,
static_cast<void*>(&alpha), desc->op_desc,
y_ptr, static_cast<void*>(&alpha),
desc.y_desc, y_ptr,
x_ptr, desc->y_desc,
desc.x_desc, x_ptr,
static_cast<void*>(&beta), desc->x_desc,
out_ptr, static_cast<void*>(&beta),
desc.out_desc, out_ptr,
out_ptr, desc->out_desc,
desc.out_desc, out_ptr,
reinterpret_cast<cublasLtMatmulAlgo_t*>(best_algo), desc->out_desc,
workspace->ptr(), desc->algo,
workspace_size, workspace->ptr(),
ctx.stream())); workspace_size,
ctx.stream()));
} }
static void SearchBestAlgo(const phi::GPUContext& ctx, static void SearchBestAlgo(const phi::GPUContext& ctx,
const cublasLtHandle_t& lt_handle, const cublasLtHandle_t& lt_handle,
const cublasLtMatmulDesc_t& op_desc, MatmulDescriptor* desc,
const cublasLtMatrixLayout_t& y_desc,
const cublasLtMatrixLayout_t& x_desc,
const cublasLtMatrixLayout_t& out_desc,
const void* alpha, const void* alpha,
const void* beta, const void* beta,
const void* y_data, const void* y_data,
const void* x_data, const void* x_data,
void* out_data, void* out_data,
void* workspace_ptr, void* workspace_ptr,
size_t workspace_size, size_t workspace_size) {
cublasLtMatmulAlgo_t* best_algo) { cublasLtMatmulAlgo_t* best_algo = desc->SetAlgo();
const auto& stream = ctx.stream(); const auto& stream = ctx.stream();
int returned_results = 0; int returned_results = 0;
constexpr int requested_algo_count = 10; constexpr int requested_algo_count = 10;
cublasLtMatmulPreference_t preference; cublasLtMatmulPreference_t preference;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
dynload::cublasLtMatmulPreferenceCreate(&preference)); dynload::cublasLtMatmulPreferenceCreate(&preference));
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulPreferenceSetAttribute( PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulPreferenceSetAttribute(
...@@ -300,16 +391,15 @@ struct MatmulWithCublasLt { ...@@ -300,16 +391,15 @@ struct MatmulWithCublasLt {
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&workspace_size, &workspace_size,
sizeof(workspace_size))); sizeof(workspace_size)));
std::vector<cublasLtMatmulHeuristicResult_t> heuristic_results( std::vector<cublasLtMatmulHeuristicResult_t> heuristic_results(
requested_algo_count); requested_algo_count);
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
dynload::cublasLtMatmulAlgoGetHeuristic(lt_handle, dynload::cublasLtMatmulAlgoGetHeuristic(lt_handle,
op_desc, desc->op_desc,
y_desc, desc->y_desc,
x_desc, desc->x_desc,
out_desc, desc->out_desc,
out_desc, desc->out_desc,
preference, preference,
requested_algo_count, requested_algo_count,
heuristic_results.data(), heuristic_results.data(),
...@@ -317,7 +407,6 @@ struct MatmulWithCublasLt { ...@@ -317,7 +407,6 @@ struct MatmulWithCublasLt {
PADDLE_ENFORCE_GT(returned_results, PADDLE_ENFORCE_GT(returned_results,
0, 0,
phi::errors::Unavailable("No GEMM algorithm avaliable.")); phi::errors::Unavailable("No GEMM algorithm avaliable."));
phi::GpuTimer timer; phi::GpuTimer timer;
int best_algo_idx = -1; int best_algo_idx = -1;
constexpr int repeats = 6; constexpr int repeats = 6;
...@@ -329,17 +418,17 @@ struct MatmulWithCublasLt { ...@@ -329,17 +418,17 @@ struct MatmulWithCublasLt {
timer.Start(stream); timer.Start(stream);
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
dynload::cublasLtMatmul(lt_handle, dynload::cublasLtMatmul(lt_handle,
op_desc, desc->op_desc,
alpha, alpha,
y_data, y_data,
y_desc, desc->y_desc,
x_data, x_data,
x_desc, desc->x_desc,
beta, beta,
out_data, out_data,
out_desc, desc->out_desc,
out_data, out_data,
out_desc, desc->out_desc,
&(heuristic_results[algo_idx].algo), &(heuristic_results[algo_idx].algo),
workspace_ptr, workspace_ptr,
workspace_size, workspace_size,
...@@ -360,7 +449,6 @@ struct MatmulWithCublasLt { ...@@ -360,7 +449,6 @@ struct MatmulWithCublasLt {
} }
} }
VLOG(4) << "Best_algo_idx in MatmulWithCublaslt is : " << best_algo_idx; VLOG(4) << "Best_algo_idx in MatmulWithCublaslt is : " << best_algo_idx;
*best_algo = heuristic_results[best_algo_idx].algo; *best_algo = heuristic_results[best_algo_idx].algo;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
dynload::cublasLtMatmulPreferenceDestroy(preference)); dynload::cublasLtMatmulPreferenceDestroy(preference));
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册