Unverified commit 46980d68, authored by Wilber, committed by GitHub

add ltgemm gemv. test=develop (#4155)

Parent 89cfa8e6
......@@ -21,6 +21,10 @@
#include <cudnn.h>
#include "lite/utils/cp_logging.h"
#if (CUBLAS_VER_MAJOR * 10 + CUBLAS_VER_MINOR) >= 101
#include <cublasLt.h>
#endif
/*
* This file contains some CUDA specific utils.
*/
......
......@@ -16,6 +16,7 @@ nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps})
nv_library(cuda_gru_forward SRCS gru_forward.cu DEPS cuda_activation ${cuda_static_deps})
nv_library(cuda_sequence2batch SRCS sequence2batch.cu DEPS ${cuda_static_deps})
nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_gemv SRCS gemv.cc DEPS ${cuda_static_deps})
nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_strided_gemm SRCS strided_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_sequence_padding SRCS sequence_padding.cu DEPS ${cuda_static_deps})
......@@ -35,6 +36,7 @@ set (
cuda_gru_forward
cuda_sequence2batch
cuda_gemm
cuda_gemv
cuda_batched_gemm
cuda_strided_gemm
cuda_sequence_padding
......
......@@ -123,6 +123,156 @@ bool Gemm<half, half>::run(const half alpha,
template class Gemm<float, float>;
template class Gemm<half, half>;
// LtGemm
template <typename T>
class cublasTypeWrapper;
template <>
class cublasTypeWrapper<float> {
public:
static const cudaDataType_t type = CUDA_R_32F;
};
template <>
class cublasTypeWrapper<half> {
public:
static const cudaDataType_t type = CUDA_R_16F;
};
#if (CUBLAS_VER_MAJOR * 10 + CUBLAS_VER_MINOR) >= 101
template <typename PTypeIn, typename PTypeOut>
bool LtGemm<PTypeIn, PTypeOut>::init(const bool trans_a,
const bool trans_b,
const int m,
const int n,
const int k,
Context<TARGET(kCUDA)> *ctx) {
int lda = (!trans_a) ? k : m;
int ldb = (!trans_b) ? n : k;
int ldc = n;
return this->init(trans_a, trans_b, m, n, k, lda, ldb, ldc, ctx);
}
template <typename PTypeIn, typename PTypeOut>
bool LtGemm<PTypeIn, PTypeOut>::init(const bool trans_a,
const bool trans_b,
const int m,
const int n,
const int k,
const int lda,
const int ldb,
const int ldc,
Context<TARGET(kCUDA)> *ctx) {
if (handle_ == nullptr) {
this->exe_stream_ = ctx->exec_stream();
CUBLAS_CALL(cublasLtCreate(&handle_));
}
m_ = m;
n_ = n;
k_ = k;
lda_ = lda;
ldb_ = ldb;
ldc_ = ldc;
cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N;
// create operation descriptor; see cublasLtMatmulDescAttributes_t for
// details about defaults; here we just need to set the transforms for A and B
CUBLAS_CALL(cublasLtMatmulDescCreate(&matmul_desc_,
cublasTypeWrapper<PTypeOut>::type));
CUBLAS_CALL(cublasLtMatmulDescSetAttribute(matmul_desc_,
CUBLASLT_MATMUL_DESC_TRANSA,
&cu_trans_b_,
sizeof(cu_trans_b_)));
CUBLAS_CALL(cublasLtMatmulDescSetAttribute(matmul_desc_,
CUBLASLT_MATMUL_DESC_TRANSB,
&cu_trans_a_,
sizeof(cu_trans_a_)));
// create matrix descriptors, we are good with the details here so no need to
// set any extra attributes
CUBLAS_CALL(cublasLtMatrixLayoutCreate(&a_desc_,
cublasTypeWrapper<PTypeOut>::type,
trans_a == false ? k : m,
trans_a == false ? m : k,
lda));
CUBLAS_CALL(cublasLtMatrixLayoutCreate(&b_desc_,
cublasTypeWrapper<PTypeOut>::type,
trans_b == false ? n : k,
trans_b == false ? k : n,
ldb));
CUBLAS_CALL(cublasLtMatrixLayoutCreate(
&c_desc_, cublasTypeWrapper<PTypeOut>::type, n, m, ldc));
// create preference handle; here we could use extra attributes to disable
// tensor ops or to make sure algo selected will work with badly aligned A, B,
// C; here for simplicity we just assume A,B,C are always well aligned (e.g.
// directly come from cudaMalloc)
CUBLAS_CALL(cublasLtMatmulPreferenceCreate(&preference_));
if (!workspace_) {
CUDA_CALL(cudaMalloc(&this->workspace_, workspace_size_));
}
CUBLAS_CALL(cublasLtMatmulPreferenceSetAttribute(
preference_,
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&workspace_size_,
sizeof(workspace_size_)));
// we just need the best available heuristic to try and run matmul. There is
// no guarantee this will work; e.g. if A is badly aligned, you can request
// more (e.g. 32) algos and try them one by one until one works
CUBLAS_CALL(cublasLtMatmulAlgoGetHeuristic(handle_,
matmul_desc_,
b_desc_,
a_desc_,
c_desc_,
c_desc_,
preference_,
1,
&heuristic_result_,
&returned_results_));
if (returned_results_ == 0) {
LOG(FATAL) << "cuBLAS API failed with status "
<< CUBLAS_STATUS_NOT_SUPPORTED;
}
return true;
}
template <typename PTypeIn, typename PTypeOut>
bool LtGemm<PTypeIn, PTypeOut>::run(const PTypeOut alpha,
const PTypeOut beta,
const PTypeIn *a,
const PTypeIn *b,
PTypeOut *c,
Context<TARGET(kCUDA)> *ctx) {
CUBLAS_CALL(cublasLtMatmul(handle_,
matmul_desc_,
&alpha,
b,
b_desc_,
a,
a_desc_,
&beta,
c,
c_desc_,
c,
c_desc_,
&heuristic_result_.algo,
workspace_,
workspace_size_,
this->exe_stream_));
return true;
}
template class LtGemm<float, float>;
template class LtGemm<half, half>;
#endif
} // namespace math
} // namespace cuda
} // namespace lite
......
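For orientation, a minimal usage sketch of the LtGemm wrapper added above (variable names, buffer sizes, and the surrounding context object are illustrative, not part of this patch):

// Sketch: C(m x n) = A(m x k) * B(k x n) in fp32, row-major device buffers.
// `ctx` is an already-initialized Context<TARGET(kCUDA)>; d_a/d_b/d_c are
// device pointers obtained elsewhere (e.g. via cudaMalloc).
lite::cuda::math::LtGemm<float, float> lt_gemm;
CHECK(lt_gemm.init(false, false, m, n, k, &ctx));     // no transpose on A or B
CHECK(lt_gemm.run(1.0f, 0.0f, d_a, d_b, d_c, &ctx));  // alpha = 1, beta = 0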
......@@ -13,7 +13,6 @@
// limitations under the License.
#pragma once
#include <cudnn.h>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
......@@ -70,6 +69,79 @@ class Gemm {
int ldc_{-1};
};
#if (CUBLAS_VER_MAJOR * 10 + CUBLAS_VER_MINOR) >= 101
template <typename PtypeIn, typename PtypeOut>
class LtGemm {
public:
LtGemm()
: handle_(nullptr),
matmul_desc_(nullptr),
a_desc_(nullptr),
b_desc_(nullptr),
c_desc_(nullptr),
preference_(nullptr),
returned_results_(0),
workspace_size_(4 * 1024 * 1024),
workspace_{nullptr} {}
~LtGemm() {
if (this->workspace_) {
CUDA_CALL(cudaFree(this->workspace_));
}
this->workspace_ = nullptr;
}
bool init(const bool trans_a,
const bool trans_b,
const int m,
const int n,
const int k,
Context<TARGET(kCUDA)>* ctx);
bool init(const bool trans_a,
const bool trans_b,
const int m,
const int n,
const int k,
const int lda,
const int ldb,
const int ldc,
Context<TARGET(kCUDA)>* ctx);
bool run(const PtypeOut alpha,
const PtypeOut beta,
const PtypeIn* a,
const PtypeIn* b,
PtypeOut* c,
Context<TARGET(kCUDA)>* ctx);
cublasLtHandle_t get_handle() const { return handle_; }
private:
cudaStream_t exe_stream_;
cublasLtHandle_t handle_;
cublasLtMatmulDesc_t matmul_desc_;
cublasLtMatrixLayout_t a_desc_;
cublasLtMatrixLayout_t b_desc_;
cublasLtMatrixLayout_t c_desc_;
cublasLtMatmulPreference_t preference_;
int returned_results_;
cublasLtMatmulHeuristicResult_t heuristic_result_{};
cublasOperation_t cu_trans_a_;
cublasOperation_t cu_trans_b_;
int m_{-1};
int n_{-1};
int k_{-1};
int lda_{-1};
int ldb_{-1};
int ldc_{-1};
size_t workspace_size_;
void* workspace_;
};
#endif
} // namespace math
} // namespace cuda
} // namespace lite
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/gemv.h"
#include <iostream>
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename PTypeIn, typename PTypeOut>
bool Gemv<PTypeIn, PTypeOut>::init(const bool trans,
const int m,
const int n,
const int lda,
const int ldb,
const int ldc,
Context<TARGET(kCUDA)> *ctx) {
if (cu_handle_ == nullptr) {
this->exe_stream_ = ctx->exec_stream();
CUBLAS_CALL(cublasCreate(&cu_handle_));
CUBLAS_CALL(cublasSetMathMode(cu_handle_, CUBLAS_TENSOR_OP_MATH));
CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_));
}
m_ = m;
n_ = n;
lda_ = lda;
ldb_ = ldb;
ldc_ = ldc;
cu_trans_ = trans ? CUBLAS_OP_N : CUBLAS_OP_T;
return true;
}
template <>
bool Gemv<float, float>::run(const float alpha,
const float beta,
const float *a,
const float *b,
float *c) {
CUBLAS_CALL(cublasSgemv(
cu_handle_, cu_trans_, n_, m_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_));
return true;
}
template <>
bool Gemv<half, half>::run(
const half alpha, const half beta, const half *a, const half *b, half *c) {
LOG(FATAL) << "not supported";
return false;
}
template class Gemv<float, float>;
template class Gemv<half, half>;
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
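A short usage sketch of the new Gemv wrapper, assuming a prepared CUDA context and device buffers (names are illustrative). Because cuBLAS expects column-major storage, init() maps a non-transposed row-major A to CUBLAS_OP_T and run() passes the dimensions as (n, m), so callers can reason purely in row-major terms:

// Sketch: y(m) = A(m x n) * x(n) in fp32, with A stored row-major on device.
// lda is the row stride of A (= n); ldb/ldc are the increments of x and y.
lite::cuda::math::Gemv<float, float> gemv;
CHECK(gemv.init(false /* trans */, m, n, /* lda */ n, /* ldb */ 1, /* ldc */ 1, &ctx));
CHECK(gemv.run(1.0f, 0.0f, d_a, d_x, d_y));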
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cudnn.h>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/context.h"
#include "lite/core/target_wrapper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename PtypeIn, typename PtypeOut>
class Gemv {
public:
Gemv() : cu_handle_(nullptr) {}
~Gemv() {}
bool init(const bool trans_,
const int m,
const int n,
const int lda,
const int ldb,
const int ldc,
Context<TARGET(kCUDA)>* ctx);
bool run(const PtypeOut alpha,
const PtypeOut beta,
const PtypeIn* a,
const PtypeIn* b,
PtypeOut* c);
cublasHandle_t get_handle() const { return cu_handle_; }
private:
cudaStream_t exe_stream_;
cublasHandle_t cu_handle_;
cublasOperation_t cu_trans_;
int m_{-1};
int n_{-1};
int lda_{-1};
int ldb_{-1};
int ldc_{-1};
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/kernels/cuda/mul_compute.h"
#include "lite/core/op_registry.h"
namespace paddle {
......@@ -23,9 +24,18 @@ namespace cuda {} // namespace cuda
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
mul, kCUDA, kFloat, kNCHW, paddle::lite::kernels::cuda::MulCompute, def)
using MulFp32 =
paddle::lite::kernels::cuda::MulCompute<float, PRECISION(kFloat)>;
using MulFp16 = paddle::lite::kernels::cuda::MulCompute<half, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, MulFp32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
REGISTER_LITE_KERNEL(mul, kCUDA, kFP16, kNCHW, MulFp16, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))})
.Finalize();
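Which of the two registrations is used depends on the precision of the places a model is built with; a rough sketch of preferring the fp16 kernel through the C++ API (the config calls shown are an assumption about the surrounding API, not part of this change):

// Hypothetical predictor configuration preferring the fp16 CUDA mul kernel.
paddle::lite_api::CxxConfig config;
config.set_model_dir(model_dir);  // model_dir is illustrative
config.set_valid_places({
    paddle::lite_api::Place{TARGET(kCUDA), PRECISION(kFP16)},
    paddle::lite_api::Place{TARGET(kCUDA), PRECISION(kFloat)},
    paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)},
});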
......@@ -23,22 +23,21 @@ namespace lite {
namespace kernels {
namespace cuda {
class MulCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class MulCompute : public KernelLite<TARGET(kCUDA), PType> {
public:
using param_t = operators::MulParam;
void PrepareForRun() override {
gemm_impl_.reset(new lite::cuda::math::Gemm<float, float>);
gemm_impl_.reset(new lite::cuda::math::Gemm<T, T>);
}
void Run() override {
CHECK(ctx_) << "running context should be set first";
auto& context = this->ctx_->template As<CUDAContext>();
auto& param = this->Param<param_t>();
const auto* x_data = param.x->data<float>();
const auto* y_data = param.y->data<float>();
auto* out_data = param.output->mutable_data<float>(TARGET(kCUDA));
auto& param = this->template Param<param_t>();
const auto* x_data = param.x->template data<T>();
const auto* y_data = param.y->template data<T>();
auto* out_data = param.output->template mutable_data<T>(TARGET(kCUDA));
int x_h = static_cast<int>(
param.x->dims().Slice(0, param.x_num_col_dims).production());
......@@ -61,7 +60,7 @@ class MulCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
virtual ~MulCompute() = default;
private:
std::unique_ptr<lite::cuda::math::Gemm<float, float>> gemm_impl_{nullptr};
std::unique_ptr<lite::cuda::math::Gemm<T, T>> gemm_impl_{nullptr};
};
} // namespace cuda
......
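The mul kernel operates on 2-D views of possibly higher-rank tensors. For reference, a sketch of the flattening it relies on; the x_w computation mirrors the elided part of the hunk above and is stated here as an assumption:

// With x.dims() = {2, 3, 4, 5} and x_num_col_dims = 2, x is treated as a
// (2 * 3) x (4 * 5) = 6 x 20 matrix; y is flattened analogously with
// y_num_col_dims, and a single GEMM is run on the 2-D views.
int x_h = static_cast<int>(
    param.x->dims().Slice(0, param.x_num_col_dims).production());
int x_w = static_cast<int>(
    param.x->dims()
        .Slice(param.x_num_col_dims, param.x->dims().size())
        .production());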
......@@ -16,58 +16,182 @@
#include <gtest/gtest.h>
#include <memory>
#include <utility>
#include "lite/backends/cuda/blas.h"
#include <vector>
#include "lite/api/test_helper.h"
#include "lite/utils/float16.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
TEST(mul_compute, normal) {
MulCompute mul_kernel;
std::unique_ptr<KernelContext> ctx(new KernelContext);
auto& context = ctx->As<CUDAContext>();
Tensor x, y, out, x_cpu, y_cpu, out_cpu;
int x_h = 2, x_w_y_h = 3, y_w = 4;
out.Resize({x_h, y_w});
x_cpu.Resize({x_h, x_w_y_h});
y_cpu.Resize({x_w_y_h, y_w});
out_cpu.Resize({x_h, y_w});
auto* out_data = out.mutable_data<float>(TARGET(kCUDA));
float* x_cpu_data = x_cpu.mutable_data<float>();
float* y_cpu_data = y_cpu.mutable_data<float>();
float* out_cpu_data = out_cpu.mutable_data<float>();
for (int i = 0; i < x_cpu.numel(); i++) {
x_cpu_data[i] = i + 1.0;
class MulTest : public ::testing::Test {
protected:
MulTest()
: m_(2),
k_(3),
n_(4),
x_shape_({m_, k_}),
y_shape_({k_, n_}),
out_shape_({m_, n_}) {
x_gpu_.Resize(lite::DDim(x_shape_));
x_ref_.Resize(lite::DDim(x_shape_));
y_gpu_.Resize(lite::DDim(y_shape_));
y_ref_.Resize(lite::DDim(y_shape_));
auto x_ref_data = x_ref_.mutable_data<float>();
auto y_ref_data = y_ref_.mutable_data<float>();
// prepare input
for (int64_t i = 0; i < x_ref_.numel(); i++) {
x_ref_data[i] = static_cast<float>(i % 10 * 0.2);
}
for (int64_t i = 0; i < y_ref_.numel(); i++) {
y_ref_data[i] = static_cast<float>(i % 10 * 0.2);
}
out_ref_.Resize(lite::DDim(out_shape_));
out_cpu_.Resize(lite::DDim(out_shape_));
out_gpu_.Resize(lite::DDim(out_shape_));
RunBaseLine(&x_ref_, &y_ref_, &out_ref_);
InitParamAndContext();
}
void InitParamAndContext() {
ctx_.reset(new KernelContext);
cudaStreamCreate(&stream_);
auto& context = ctx_->As<CUDAContext>();
context.SetExecStream(stream_);
param_.x = &x_gpu_;
param_.y = &y_gpu_;
param_.output = &out_gpu_;
}
void InitFloatInput() {
x_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(x_ref_.data<float>(),
x_gpu_.dims());
y_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(y_ref_.data<float>(),
y_gpu_.dims());
}
void InitHalfInput() {
x_half_.Resize(lite::DDim(x_ref_.dims()));
auto x_half_data = x_half_.mutable_data<half>();
for (int64_t i = 0; i < x_half_.numel(); i++) {
x_half_data[i] = half(lite::float16(x_ref_.data<float>()[i]));
}
x_gpu_.Assign<half, lite::DDim, TARGET(kCUDA)>(x_half_data, x_gpu_.dims());
y_half_.Resize(y_ref_.dims());
auto y_half_data = y_half_.mutable_data<half>();
for (int64_t i = 0; i < y_half_.numel(); i++) {
y_half_data[i] = half(lite::float16(y_ref_.data<float>()[i]));
}
y_gpu_.Assign<half, lite::DDim, TARGET(kCUDA)>(y_half_data, y_gpu_.dims());
}
void RunBaseLine(const lite::Tensor* x,
const lite::Tensor* w,
lite::Tensor* out) {
const float* data_in = x->data<float>();
const float* weights = w->data<float>();
float* data_out = out->mutable_data<float>();
int out_rows = x->dims()[0];
int in_cols = x->numel() / out_rows;
int out_cols = w->numel() / in_cols;
int index_out;
for (int i = 0; i < out_rows; i++) {
for (int j = 0; j < out_cols; j++) {
index_out = i * out_cols + j;
data_out[index_out] = 0;
for (int k = 0; k < in_cols; k++) {
data_out[index_out] +=
data_in[i * in_cols + k] * weights[k * out_cols + j];
}
}
}
}
for (int i = 0; i < y_cpu.numel(); i++) {
y_cpu_data[i] = i + 1.0;
int m_, k_, n_;
std::vector<int64_t> x_shape_, y_shape_, out_shape_;
lite::Tensor x_ref_, y_ref_, out_ref_;
lite::Tensor x_gpu_, y_gpu_;
lite::Tensor x_half_, y_half_;
lite::Tensor out_cpu_, out_gpu_;
operators::MulParam param_;
std::unique_ptr<KernelContext> ctx_;
cudaStream_t stream_;
};
TEST_F(MulTest, TestFP32) {
InitFloatInput();
MulCompute<float, PRECISION(kFloat)> mul_kernel;
mul_kernel.SetParam(param_);
mul_kernel.SetContext(std::move(ctx_));
for (int i = 0; i < FLAGS_warmup; ++i) {
mul_kernel.Launch();
cudaDeviceSynchronize();
}
x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
y.Assign<float, lite::DDim, TARGET(kCUDA)>(y_cpu_data, y_cpu.dims());
auto start = GetCurrentUS();
mul_kernel.PrepareForRun();
for (int i = 0; i < FLAGS_repeats; ++i) {
mul_kernel.Run();
}
cudaDeviceSynchronize();
auto duration = (GetCurrentUS() - start) / 1000.0;
LOG(INFO) << "fp32, warmup: " << FLAGS_warmup
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
operators::MulParam param;
param.x = &x;
param.y = &y;
param.output = &out;
mul_kernel.SetParam(param);
CopySync<TARGET(kCUDA)>(out_cpu_.mutable_data<float>(),
out_gpu_.data<float>(),
sizeof(float) * out_gpu_.numel(),
IoDirection::DtoH);
cudaStream_t stream;
cudaStreamCreate(&stream);
context.SetExecStream(stream);
for (int i = 0; i < out_gpu_.numel(); ++i) {
float res = out_cpu_.data<float>()[i];
float ref = out_ref_.data<float>()[i];
EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-4);
}
}
TEST_F(MulTest, TestFP16) {
InitHalfInput();
MulCompute<half, PRECISION(kFP16)> mul_kernel;
mul_kernel.SetParam(param_);
mul_kernel.SetContext(std::move(ctx_));
for (int i = 0; i < FLAGS_warmup; ++i) {
mul_kernel.Launch();
cudaDeviceSynchronize();
}
mul_kernel.SetContext(std::move(ctx));
mul_kernel.Launch();
auto start = GetCurrentUS();
mul_kernel.PrepareForRun();
for (int i = 0; i < FLAGS_repeats; ++i) {
mul_kernel.Run();
}
cudaDeviceSynchronize();
auto duration = (GetCurrentUS() - start) / 1000.0;
LOG(INFO) << "fp16, warmup: " << FLAGS_warmup
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
const half* out_gpu_data = out_gpu_.data<half>();
half* out_cpu_data = out_cpu_.mutable_data<half>();
CopySync<TARGET(kCUDA)>(out_cpu_data,
out_gpu_data,
sizeof(half) * out_gpu_.numel(),
IoDirection::DtoH);
CopySync<TARGET(kCUDA)>(
out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH);
for (int i = 0; i < out_cpu.numel(); i++) {
LOG(INFO) << out_cpu_data[i];
for (int i = 0; i < out_cpu_.numel(); ++i) {
float res = static_cast<float>(lite::float16(out_cpu_data[i]));
float ref = out_ref_.data<float>()[i];
EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2);
}
}
......