Unverified · Commit 5a55f13b authored by wangzhen38, committed by GitHub

【code format】Fix cpplint style 4 (#43695)

* cpplint fix 2

* cpplint fix 2

* fix cpplint style 4

* fix cpplint style 4

* fix cpplint style 4

* fix cpplint style 4
Parent 75080988
@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h"
namespace plf = paddle::platform;
@@ -21,73 +20,174 @@ namespace inference {
namespace tensorrt {
namespace plugin {
float zero = 0;
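// Int8 path: cuBLASLt's IMMA int8 kernels require the operands in opaque
// tensor-core layouts (COL32 for A and C, COL4_4R2_8C for B, as set up in
// configurePlugin below). Ltgemm_int8_linear therefore re-lays-out A and B
// with cublasLtMatrixTransform, runs the matmul on the transformed buffers,
// and transforms the COL32 result back into C's original layout.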
void Ltgemm_int8_linear(cublasLtHandle_t ltHandle,
                        const int8_t* A,
                        cublasLtMatrixLayout_t Adesc,
                        int8_t* Atransform,
                        cublasLtMatrixLayout_t AtransformDesc,
                        bool transA_,
                        const int8_t* B,
                        cublasLtMatrixLayout_t Bdesc,
                        int8_t* Btransform,
                        cublasLtMatrixLayout_t BtransformDesc,
                        bool transB_,
                        int8_t* C,
                        cublasLtMatrixLayout_t Cdesc,
                        int8_t* Ctransform,
                        cublasLtMatrixLayout_t CtransformDesc,
                        cublasLtMatrixTransformDesc_t transformDescT,
                        cublasLtMatrixTransformDesc_t transformDescN,
                        cublasLtMatmulDesc_t matmulDesc,
                        void* alpha_scale,
                        void* alpha_zero,
                        void* alpha_one,
                        void* workspace,
                        cudaStream_t stream) {
  if (transA_) {
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(ltHandle,
                                                            transformDescT,
                                                            alpha_one,
                                                            A,
                                                            Adesc,
                                                            alpha_zero,
                                                            nullptr,
                                                            nullptr,
                                                            Atransform,
                                                            AtransformDesc,
                                                            stream));
  } else {
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(ltHandle,
                                                            transformDescN,
                                                            alpha_one,
                                                            A,
                                                            Adesc,
                                                            alpha_zero,
                                                            nullptr,
                                                            nullptr,
                                                            Atransform,
                                                            AtransformDesc,
                                                            stream));
  }
  if (transB_) {
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(ltHandle,
                                                            transformDescN,
                                                            alpha_one,
                                                            B,
                                                            Bdesc,
                                                            alpha_zero,
                                                            nullptr,
                                                            nullptr,
                                                            Btransform,
                                                            BtransformDesc,
                                                            stream));
  } else {
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(ltHandle,
                                                            transformDescT,
                                                            alpha_one,
                                                            B,
                                                            Bdesc,
                                                            alpha_zero,
                                                            nullptr,
                                                            nullptr,
                                                            Btransform,
                                                            BtransformDesc,
                                                            stream));
  }
  PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul(ltHandle,
                                                 matmulDesc,
                                                 alpha_scale,
                                                 Atransform,
                                                 AtransformDesc,
                                                 Btransform,
                                                 BtransformDesc,
                                                 nullptr,
                                                 Ctransform,
                                                 CtransformDesc,
                                                 Ctransform,
                                                 CtransformDesc,
                                                 nullptr,
                                                 workspace,
                                                 0,
                                                 stream));

  PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(ltHandle,
                                                          transformDescN,
                                                          alpha_one,
                                                          Ctransform,
                                                          CtransformDesc,
                                                          alpha_zero,
                                                          nullptr,
                                                          nullptr,
                                                          C,
                                                          Cdesc,
                                                          stream));
}
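// The fp32/fp16 paths below need no layout conversion: cublasLtMatmul is
// called directly on the original descriptors, with alpha_scale as alpha and
// alpha_zero as beta (both device pointers; see the pointer-mode setup in
// configurePlugin).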
void Ltgemm_fp32_linear(cublasLtHandle_t ltHandle,
                        const float* A,
                        cublasLtMatrixLayout_t Adesc,
                        const float* B,
                        cublasLtMatrixLayout_t Bdesc,
                        float* C,
                        cublasLtMatrixLayout_t Cdesc,
                        cublasLtMatmulDesc_t matmulDesc,
                        void* alpha_scale,
                        void* alpha_zero,
                        void* workspace,
                        cudaStream_t stream) {
  PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul(ltHandle,
                                                 matmulDesc,
                                                 alpha_scale,
                                                 A,
                                                 Adesc,
                                                 B,
                                                 Bdesc,
                                                 alpha_zero,
                                                 C,
                                                 Cdesc,
                                                 C,
                                                 Cdesc,
                                                 nullptr,
                                                 workspace,
                                                 0,
                                                 stream));
}

void Ltgemm_fp16_linear(cublasLtHandle_t ltHandle,
                        const half* A,
                        cublasLtMatrixLayout_t Adesc,
                        const half* B,
                        cublasLtMatrixLayout_t Bdesc,
                        half* C,
                        cublasLtMatrixLayout_t Cdesc,
                        cublasLtMatmulDesc_t matmulDesc,
                        void* alpha_scale,
                        void* alpha_zero,
                        void* workspace,
                        cudaStream_t stream) {
  PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul(ltHandle,
                                                 matmulDesc,
                                                 alpha_scale,
                                                 A,
                                                 Adesc,
                                                 B,
                                                 Bdesc,
                                                 alpha_zero,
                                                 C,
                                                 Cdesc,
                                                 C,
                                                 Cdesc,
                                                 nullptr,
                                                 workspace,
                                                 0,
                                                 stream));
}

nvinfer1::DataType MatmulPlugin::getOutputDataType(
    int index,
    const nvinfer1::DataType* input_types,
    int nb_inputs) const TRT_NOEXCEPT {
  return input_types[0];
}

@@ -123,13 +223,17 @@ nvinfer1::Dims MatmulPlugin::getOutputDimensions(
}

bool MatmulPlugin::supportsFormatCombination(
    int32_t pos,
    nvinfer1::PluginTensorDesc const* inOut,
    int32_t nbInputs,
    int32_t nbOutputs) const TRT_NOEXCEPT {
  PADDLE_ENFORCE_EQ(nbInputs,
                    2,
                    platform::errors::InvalidArgument("Must have 2 inputs, "
                                                      "but got %d input(s). ",
                                                      nbInputs));
  PADDLE_ENFORCE_EQ(nbOutputs,
                    getNbOutputs(),
                    platform::errors::InvalidArgument("Must have 1 output, "
                                                      "but got %d output(s). ",
                                                      nbOutputs));
@@ -192,64 +296,88 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
        reinterpret_cast<void**>(&Ctransform_),
        sizeof(int8_t) * ((m_ + 32 - 1) / 32 * 32) / 32 * ldctransform));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Adesc_,
                                        cudadataTypeIO,
                                        AopTranspose == CUBLAS_OP_N ? n_ : k_,
                                        AopTranspose == CUBLAS_OP_N ? k_ : n_,
                                        AopTranspose == CUBLAS_OP_N ? n_ : k_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Adesc_,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Adesc_,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(stridea),
        sizeof(stridea)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Bdesc_,
                                        cudadataTypeIO,
                                        BopTranspose == CUBLAS_OP_N ? k_ : m_,
                                        BopTranspose == CUBLAS_OP_N ? m_ : k_,
                                        BopTranspose == CUBLAS_OP_N ? k_ : m_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Bdesc_,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Bdesc_,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(strideb),
        sizeof(strideb)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Cdesc_,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Cdesc_,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(stridec),
        sizeof(stridec)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
        &AtransformDesc_, cudadataTypeIO, n_, k_, ldatransform));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(AtransformDesc_,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        AtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
        &BtransformDesc_, cudadataTypeIO, m_, k_, ldbtransform));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(BtransformDesc_,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(BtransformDesc_,
                                              CUBLASLT_MATRIX_LAYOUT_ORDER,
                                              &COL4_4R2_8C,
                                              sizeof(COL4_4R2_8C)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
        &CtransformDesc_, cudadataTypeIO, n_, m_, ldctransform));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(CtransformDesc_,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        CtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32)));
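    // The *transformDesc_ layouts describe the same matrices in the opaque
    // orders required by the int8 kernels (COL32 for A/C, COL4_4R2_8C for B).
    // Their leading dimensions ldatransform/ldbtransform/ldctransform come
    // from code elided in this hunk; the Ctransform_ allocation above shows
    // the convention, with the row count padded up to a multiple of 32.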
@@ -259,23 +387,33 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescCreate(
        &transformDescT_, cudaDataTypeS));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
        transformDescT_,
        CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
        &cudaDataTypeS,
        sizeof(cudaDataTypeS)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
        transformDescT_,
        CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA,
        &Transpose,
        sizeof(Transpose)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
        transformDescT_,
        CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
        &transform_model,
        sizeof(transform_model)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescCreate(
        &transformDescN_, cudaDataTypeS));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
        transformDescN_,
        CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
        &cudaDataTypeS,
        sizeof(cudaDataTypeS)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
        transformDescN_,
        CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
        &transform_model,
        sizeof(transform_model)));
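    // transformDescT_ additionally sets CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA,
    // so a transpose is folded into the layout conversion; transformDescN_
    // converts the layout as-is. Ltgemm_int8_linear picks between them per
    // operand via transA_/transB_.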
    cublasOperation_t ATranspose = CUBLAS_OP_N, BTranspose = CUBLAS_OP_T;
    cublasLtPointerMode_t matmul_model =
@@ -289,14 +427,20 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
        &matmulDesc_, cudaComputeType, cudaDataTypeS));
#endif
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
                                            CUBLASLT_MATMUL_DESC_TRANSA,
                                            &ATranspose,
                                            sizeof(ATranspose)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
                                            CUBLASLT_MATMUL_DESC_TRANSB,
                                            &BTranspose,
                                            sizeof(BTranspose)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
                                            CUBLASLT_MATMUL_DESC_POINTER_MODE,
                                            &matmul_model,
                                            sizeof(matmul_model)));
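    // With a device pointer mode on the matmul descriptor, alpha/beta are read
    // from GPU memory; this is why alpha_scale_/alpha_zero_/alpha_one_ are
    // cudaMalloc'ed and filled below instead of being passed from the host.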
    std::vector<float> alpha_tem(n_, 0);
@@ -305,18 +449,20 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
    }
    PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc(
        reinterpret_cast<void**>(&alpha_scale_), n_ * sizeof(float)));
    cudaMemcpyAsync(alpha_scale_,
                    &alpha_tem[0],
                    n_ * sizeof(float),
                    cudaMemcpyHostToDevice);
    float zero_tem = zero;
    PADDLE_ENFORCE_GPU_SUCCESS(
        cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(float)));
    cudaMemcpyAsync(
        alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice);
    float one_tem = 1;
    PADDLE_ENFORCE_GPU_SUCCESS(
        cudaMalloc(reinterpret_cast<void**>(&alpha_one_), sizeof(float)));
    cudaMemcpyAsync(
        alpha_one_, &one_tem, sizeof(float), cudaMemcpyHostToDevice);
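    // These cudaMemcpyAsync calls run on the default stream from pageable host
    // memory; in that case the runtime stages the source into a pinned buffer
    // before returning, so reusing the stack temporaries afterwards is safe.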
  } else if (type_ == nvinfer1::DataType::kHALF) {
    cudaDataType_t cudadataTypeIO = CUDA_R_16F;
    cudaDataType_t cudaDataTypeS = CUDA_R_16F;
@@ -325,41 +471,57 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
#else
    cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_16F;
#endif
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Adesc_,
                                        cudadataTypeIO,
                                        AopTranspose == CUBLAS_OP_N ? n_ : k_,
                                        AopTranspose == CUBLAS_OP_N ? k_ : n_,
                                        AopTranspose == CUBLAS_OP_N ? n_ : k_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Adesc_,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Adesc_,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(stridea),
        sizeof(stridea)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Bdesc_,
                                        cudadataTypeIO,
                                        BopTranspose == CUBLAS_OP_N ? k_ : m_,
                                        BopTranspose == CUBLAS_OP_N ? m_ : k_,
                                        BopTranspose == CUBLAS_OP_N ? k_ : m_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Bdesc_,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Bdesc_,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(strideb),
        sizeof(strideb)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Cdesc_,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Cdesc_,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(stridec),
        sizeof(stridec)));
    cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE;
@@ -372,26 +534,32 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
        &matmulDesc_, cudaComputeType, cudaDataTypeS));
#endif
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
                                            CUBLASLT_MATMUL_DESC_TRANSA,
                                            &AopTranspose,
                                            sizeof(AopTranspose)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
                                            CUBLASLT_MATMUL_DESC_TRANSB,
                                            &BopTranspose,
                                            sizeof(BopTranspose)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
                                            CUBLASLT_MATMUL_DESC_POINTER_MODE,
                                            &matmul_model,
                                            sizeof(matmul_model)));
    half alpha_tem = static_cast<half>(alpha_);
    PADDLE_ENFORCE_GPU_SUCCESS(
        cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(half)));
    cudaMemcpyAsync(
        alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice);
    half zero_tem = static_cast<half>(zero);
    PADDLE_ENFORCE_GPU_SUCCESS(
        cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(half)));
    cudaMemcpyAsync(
        alpha_zero_, &zero_tem, sizeof(half), cudaMemcpyHostToDevice);
  } else {
    cudaDataType_t cudadataTypeIO = CUDA_R_32F;
    cudaDataType_t cudaDataTypeS = CUDA_R_32F;
@@ -400,41 +568,57 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
#else
    cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F_FAST_16F;
#endif
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Adesc_,
                                        cudadataTypeIO,
                                        AopTranspose == CUBLAS_OP_N ? n_ : k_,
                                        AopTranspose == CUBLAS_OP_N ? k_ : n_,
                                        AopTranspose == CUBLAS_OP_N ? n_ : k_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Adesc_,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Adesc_,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(stridea),
        sizeof(stridea)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Bdesc_,
                                        cudadataTypeIO,
                                        BopTranspose == CUBLAS_OP_N ? k_ : m_,
                                        BopTranspose == CUBLAS_OP_N ? m_ : k_,
                                        BopTranspose == CUBLAS_OP_N ? k_ : m_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Bdesc_,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Bdesc_,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(strideb),
        sizeof(strideb)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Cdesc_,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Cdesc_,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(stridec),
        sizeof(stridec)));
    cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE;
@@ -447,32 +631,39 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
        &matmulDesc_, cudaComputeType, cudaDataTypeS));
#endif
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
                                            CUBLASLT_MATMUL_DESC_TRANSA,
                                            &AopTranspose,
                                            sizeof(AopTranspose)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
                                            CUBLASLT_MATMUL_DESC_TRANSB,
                                            &BopTranspose,
                                            sizeof(BopTranspose)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
                                            CUBLASLT_MATMUL_DESC_POINTER_MODE,
                                            &matmul_model,
                                            sizeof(matmul_model)));
    float alpha_tem = alpha_;
    PADDLE_ENFORCE_GPU_SUCCESS(
        cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(float)));
    cudaMemcpyAsync(
        alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice);
    float zero_tem = zero;
    PADDLE_ENFORCE_GPU_SUCCESS(
        cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(float)));
    cudaMemcpyAsync(
        alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice);
  }
}
void MatmulPlugin::attachToContext(cudnnContext* cudnnContext,
                                   cublasContext* cublasContext,
                                   nvinfer1::IGpuAllocator* gpuAllocator)
    TRT_NOEXCEPT {
  dyl::cublasLtCreate(&cublas_);
}
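// Note: the cudnn/cublas contexts TensorRT hands in are unused; the plugin
// creates its own cublasLt handle instead.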
@@ -509,35 +700,76 @@ void MatmulPlugin::terminate() TRT_NOEXCEPT {
} */
}
int MatmulPlugin::enqueue(int batchSize,
                          const void* const* inputs,
#if IS_TRT_VERSION_LT(8000)
                          void** outputs,
                          void* workspace,
                          cudaStream_t stream) {
#else
                          void* const* outputs,
                          void* workspace,
                          cudaStream_t stream) TRT_NOEXCEPT {
#endif
  if (type_ == nvinfer1::DataType::kINT8) {
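    // Binding order: inputs[0] maps to operand B and inputs[1] to operand A,
    // i.e. the TensorRT bindings are consumed in swapped order relative to
    // the A/B parameter names of the Ltgemm_* helpers.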
    const int8_t* B = static_cast<const int8_t*>(inputs[0]);
    const int8_t* A = static_cast<const int8_t*>(inputs[1]);
    int8_t* C = static_cast<int8_t*>(outputs[0]);
    Ltgemm_int8_linear(cublas_,
                       A,
                       Adesc_,
                       Atransform_,
                       AtransformDesc_,
                       transA_,
                       B,
                       Bdesc_,
                       Btransform_,
                       BtransformDesc_,
                       transB_,
                       C,
                       Cdesc_,
                       Ctransform_,
                       CtransformDesc_,
                       transformDescT_,
                       transformDescN_,
                       matmulDesc_,
                       alpha_scale_,
                       alpha_zero_,
                       alpha_one_,
                       workspace,
                       stream);
  } else if (type_ == nvinfer1::DataType::kFLOAT) {
    const float* B = static_cast<const float*>(inputs[0]);
    const float* A = static_cast<const float*>(inputs[1]);
    float* C = static_cast<float*>(outputs[0]);
    Ltgemm_fp32_linear(cublas_,
                       A,
                       Adesc_,
                       B,
                       Bdesc_,
                       C,
                       Cdesc_,
                       matmulDesc_,
                       alpha_scale_,
                       alpha_zero_,
                       workspace,
                       stream);
  } else if (type_ == nvinfer1::DataType::kHALF) {
    const half* B = static_cast<const half*>(inputs[0]);
    const half* A = static_cast<const half*>(inputs[1]);
    half* C = static_cast<half*>(outputs[0]);
    Ltgemm_fp16_linear(cublas_,
                       A,
                       Adesc_,
                       B,
                       Bdesc_,
                       C,
                       Cdesc_,
                       matmulDesc_,
                       alpha_scale_,
                       alpha_zero_,
                       workspace,
                       stream);
  } else {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "VarMessageToVarType:Unsupported type"));
@@ -546,13 +778,16 @@ int MatmulPlugin::enqueue(int batchSize, const void* const* inputs,
}
nvinfer1::DataType MatmulPluginDynamic::getOutputDataType(
    int index,
    const nvinfer1::DataType* input_types,
    int nb_inputs) const TRT_NOEXCEPT {
  return input_types[0];
}

nvinfer1::DimsExprs MatmulPluginDynamic::getOutputDimensions(
    int outputIndex,
    const nvinfer1::DimsExprs* inputs,
    int nbInputs,
    nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT {
  nvinfer1::DimsExprs output_dims(inputs[0]);
  if (transB_) {
@@ -569,13 +804,17 @@ nvinfer1::DimsExprs MatmulPluginDynamic::getOutputDimensions(
}
bool MatmulPluginDynamic::supportsFormatCombination(
    int pos,
    const nvinfer1::PluginTensorDesc* inOut,
    int nbInputs,
    int nbOutputs) TRT_NOEXCEPT {
  PADDLE_ENFORCE_EQ(nbInputs,
                    2,
                    platform::errors::InvalidArgument("Must have 2 inputs, "
                                                      "but got %d input(s). ",
                                                      nbInputs));
  PADDLE_ENFORCE_EQ(nbOutputs,
                    getNbOutputs(),
                    platform::errors::InvalidArgument("Must have 1 output, "
                                                      "but got %d output(s). ",
                                                      nbOutputs));
@@ -591,7 +830,8 @@ bool MatmulPluginDynamic::supportsFormatCombination(
}
void MatmulPluginDynamic::configurePlugin(
    const nvinfer1::DynamicPluginTensorDesc* inputs,
    int nbInputs,
    const nvinfer1::DynamicPluginTensorDesc* outputs,
    int nbOutputs) TRT_NOEXCEPT {
  float inscale_0 = inputs[0].desc.scale;
@@ -632,46 +872,49 @@ void MatmulPluginDynamic::configurePlugin(
    }
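    // For the dynamic plugin the runtime n is not known until enqueue, so the
    // per-channel alpha vector is sized for n_max (filled in code elided from
    // this hunk).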
    PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc(
        reinterpret_cast<void**>(&alpha_scale_), n_max * sizeof(float)));
    cudaMemcpyAsync(alpha_scale_,
                    &alpha_tem[0],
                    n_max * sizeof(float),
                    cudaMemcpyHostToDevice);
    float zero_tem = zero;
    PADDLE_ENFORCE_GPU_SUCCESS(
        cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(float)));
    cudaMemcpyAsync(
        alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice);
    float one_tem = 1;
    PADDLE_ENFORCE_GPU_SUCCESS(
        cudaMalloc(reinterpret_cast<void**>(&alpha_one_), sizeof(float)));
    cudaMemcpyAsync(
        alpha_one_, &one_tem, sizeof(float), cudaMemcpyHostToDevice);
  } else if (type_ == nvinfer1::DataType::kHALF) {
    half alpha_tem = static_cast<half>(alpha_);
    PADDLE_ENFORCE_GPU_SUCCESS(
        cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(half)));
    cudaMemcpyAsync(
        alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice);
    half zero_tem = static_cast<half>(zero);
    PADDLE_ENFORCE_GPU_SUCCESS(
        cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(half)));
    cudaMemcpyAsync(
        alpha_zero_, &zero_tem, sizeof(half), cudaMemcpyHostToDevice);
  } else {
    float alpha_tem = alpha_;
    PADDLE_ENFORCE_GPU_SUCCESS(
        cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(float)));
    cudaMemcpyAsync(
        alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice);
    float zero_tem = zero;
    PADDLE_ENFORCE_GPU_SUCCESS(
        cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(float)));
    cudaMemcpyAsync(
        alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice);
  }
}
void MatmulPluginDynamic::attachToContext(cudnnContext* cudnnContext,
                                          cublasContext* cublasContext,
                                          nvinfer1::IGpuAllocator* gpuAllocator)
    TRT_NOEXCEPT {
  dyl::cublasLtCreate(&cublas_);
}
@@ -710,7 +953,8 @@ void MatmulPluginDynamic::terminate() TRT_NOEXCEPT {
int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
                                 const nvinfer1::PluginTensorDesc* outputDesc,
                                 const void* const* inputs,
                                 void* const* outputs,
                                 void* workspace,
                                 cudaStream_t stream) TRT_NOEXCEPT {
  const auto Input0Desc = inputDesc[0];
  const auto Input1Desc = inputDesc[1];
@@ -770,64 +1014,88 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
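    // Unlike the static plugin, which caches its descriptors in
    // configurePlugin, the dynamic enqueue rebuilds all cuBLASLt layout and
    // matmul descriptors from the actual runtime shapes on every call.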
    cublasLtOrder_t COL32 = CUBLASLT_ORDER_COL32;
    cublasLtOrder_t COL4_4R2_8C = CUBLASLT_ORDER_COL4_4R2_8C;
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Adesc,
                                        cudadataTypeIO,
                                        AopTranspose == CUBLAS_OP_N ? n : k,
                                        AopTranspose == CUBLAS_OP_N ? k : n,
                                        AopTranspose == CUBLAS_OP_N ? n : k));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Adesc,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Adesc,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(stridea),
        sizeof(stridea)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Bdesc,
                                        cudadataTypeIO,
                                        BopTranspose == CUBLAS_OP_N ? k : m,
                                        BopTranspose == CUBLAS_OP_N ? m : k,
                                        BopTranspose == CUBLAS_OP_N ? k : m));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Bdesc,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Bdesc,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(strideb),
        sizeof(strideb)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Cdesc,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Cdesc,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(stridec),
        sizeof(stridec)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
        &AtransformDesc, cudadataTypeIO, n, k, ldatransform));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(AtransformDesc,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
        &BtransformDesc, cudadataTypeIO, m, k, ldbtransform));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(BtransformDesc,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(BtransformDesc,
                                              CUBLASLT_MATRIX_LAYOUT_ORDER,
                                              &COL4_4R2_8C,
                                              sizeof(COL4_4R2_8C)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
        &CtransformDesc, cudadataTypeIO, n, m, ldctransform));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(CtransformDesc,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32)));
@@ -837,23 +1105,33 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixTransformDescCreate(&transformDescT, cudaDataTypeS));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
        transformDescT,
        CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
        &cudaDataTypeS,
        sizeof(cudaDataTypeS)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
        transformDescT,
        CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA,
        &Transpose,
        sizeof(Transpose)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
        transformDescT,
        CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
        &transform_model,
        sizeof(transform_model)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixTransformDescCreate(&transformDescN, cudaDataTypeS));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
        transformDescN,
        CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
        &cudaDataTypeS,
        sizeof(cudaDataTypeS)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
        transformDescN,
        CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
        &transform_model,
        sizeof(transform_model)));
    cublasOperation_t ATranspose = CUBLAS_OP_N, BTranspose = CUBLAS_OP_T;
    cublasLtPointerMode_t matmul_model =
@@ -867,24 +1145,48 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
        &matmulDesc, cudaComputeType, cudaDataTypeS));
#endif
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
                                            CUBLASLT_MATMUL_DESC_TRANSA,
                                            &ATranspose,
                                            sizeof(ATranspose)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
                                            CUBLASLT_MATMUL_DESC_TRANSB,
                                            &BTranspose,
                                            sizeof(BTranspose)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
                                            CUBLASLT_MATMUL_DESC_POINTER_MODE,
                                            &matmul_model,
                                            sizeof(matmul_model)));
    const int8_t* B = static_cast<const int8_t*>(inputs[0]);
    const int8_t* A = static_cast<const int8_t*>(inputs[1]);
    int8_t* C = static_cast<int8_t*>(outputs[0]);
    Ltgemm_int8_linear(cublas_,
                       A,
                       Adesc,
                       Atransform_,
                       AtransformDesc,
                       transA_,
                       B,
                       Bdesc,
                       Btransform_,
                       BtransformDesc,
                       transB_,
                       C,
                       Cdesc,
                       Ctransform_,
                       CtransformDesc,
                       transformDescT,
                       transformDescN,
                       matmulDesc,
                       alpha_scale_,
                       alpha_zero_,
                       alpha_one_,
                       workspace,
                       stream);
  } else if (type_ == nvinfer1::DataType::kHALF) {
    cudaDataType_t cudadataTypeIO = CUDA_R_16F;
    cudaDataType_t cudaDataTypeS = CUDA_R_16F;
@@ -893,41 +1195,57 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
#else
    cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_16F;
#endif
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Adesc,
                                        cudadataTypeIO,
                                        AopTranspose == CUBLAS_OP_N ? n : k,
                                        AopTranspose == CUBLAS_OP_N ? k : n,
                                        AopTranspose == CUBLAS_OP_N ? n : k));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Adesc,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Adesc,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(stridea),
        sizeof(stridea)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Bdesc,
                                        cudadataTypeIO,
                                        BopTranspose == CUBLAS_OP_N ? k : m,
                                        BopTranspose == CUBLAS_OP_N ? m : k,
                                        BopTranspose == CUBLAS_OP_N ? k : m));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Bdesc,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Bdesc,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(strideb),
        sizeof(strideb)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Cdesc,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Cdesc,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(stridec),
        sizeof(stridec)));
cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE;
...@@ -940,21 +1258,37 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, ...@@ -940,21 +1258,37 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
&matmulDesc, cudaComputeType, cudaDataTypeS)); &matmulDesc, cudaComputeType, cudaDataTypeS));
#endif #endif
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( PADDLE_ENFORCE_GPU_SUCCESS(
matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
CUBLASLT_MATMUL_DESC_TRANSA,
&AopTranspose,
sizeof(AopTranspose))); sizeof(AopTranspose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( PADDLE_ENFORCE_GPU_SUCCESS(
matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
CUBLASLT_MATMUL_DESC_TRANSB,
&BopTranspose,
sizeof(BopTranspose))); sizeof(BopTranspose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( PADDLE_ENFORCE_GPU_SUCCESS(
matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
CUBLASLT_MATMUL_DESC_POINTER_MODE,
&matmul_model,
sizeof(matmul_model))); sizeof(matmul_model)));
const half* B = static_cast<const half*>(inputs[0]); const half* B = static_cast<const half*>(inputs[0]);
const half* A = static_cast<const half*>(inputs[1]); const half* A = static_cast<const half*>(inputs[1]);
half* C = static_cast<half*>(outputs[0]); half* C = static_cast<half*>(outputs[0]);
Ltgemm_fp16_linear(cublas_, A, Adesc, B, Bdesc, C, Cdesc, matmulDesc, Ltgemm_fp16_linear(cublas_,
alpha_scale_, alpha_zero_, workspace, stream); A,
Adesc,
B,
Bdesc,
C,
Cdesc,
matmulDesc,
alpha_scale_,
alpha_zero_,
workspace,
stream);
  } else {
    cudaDataType_t cudadataTypeIO = CUDA_R_32F;
    cudaDataType_t cudaDataTypeS = CUDA_R_32F;
...
@@ -963,41 +1297,57 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
#else
    cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F_FAST_16F;
#endif
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Adesc,
                                        cudadataTypeIO,
                                        AopTranspose == CUBLAS_OP_N ? n : k,
                                        AopTranspose == CUBLAS_OP_N ? k : n,
                                        AopTranspose == CUBLAS_OP_N ? n : k));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Adesc,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Adesc,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(stridea),
        sizeof(stridea)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Bdesc,
                                        cudadataTypeIO,
                                        BopTranspose == CUBLAS_OP_N ? k : m,
                                        BopTranspose == CUBLAS_OP_N ? m : k,
                                        BopTranspose == CUBLAS_OP_N ? k : m));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Bdesc,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Bdesc,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(strideb),
        sizeof(strideb)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatrixLayoutSetAttribute(Cdesc,
                                              CUBLASLT_MATRIX_LAYOUT_TYPE,
                                              &cudadataTypeIO,
                                              sizeof(cudadataTypeIO)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
    PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
        Cdesc,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &(stridec),
        sizeof(stridec)));
    cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE;
...
@@ -1010,21 +1360,37 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
        &matmulDesc, cudaComputeType, cudaDataTypeS));
#endif
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
                                            CUBLASLT_MATMUL_DESC_TRANSA,
                                            &AopTranspose,
                                            sizeof(AopTranspose)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
                                            CUBLASLT_MATMUL_DESC_TRANSB,
                                            &BopTranspose,
                                            sizeof(BopTranspose)));
    PADDLE_ENFORCE_GPU_SUCCESS(
        dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
                                            CUBLASLT_MATMUL_DESC_POINTER_MODE,
                                            &matmul_model,
                                            sizeof(matmul_model)));
    const float* B = static_cast<const float*>(inputs[0]);
    const float* A = static_cast<const float*>(inputs[1]);
    float* C = static_cast<float*>(outputs[0]);
    Ltgemm_fp32_linear(cublas_,
                       A,
                       Adesc,
                       B,
                       Bdesc,
                       C,
                       Cdesc,
                       matmulDesc,
                       alpha_scale_,
                       alpha_zero_,
                       workspace,
                       stream);
  }
  return cudaGetLastError() != cudaSuccess;
}
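Both the fp16 branch and the fp32 fallback above follow the same cuBLASLt recipe: create matrix layouts, tag them with type, batch count, and batch stride, set the matmul descriptor's transpose and pointer-mode attributes, then dispatch. A minimal sketch of that flow against the raw cuBLASLt API follows; it is an illustration only (names, dimensions, and the lack of status checking are assumptions), and the plugin itself routes every call through its dyl:: dynamic-load wrappers.

#include <cublasLt.h>
#include <cuda_runtime.h>

// Single FP32 GEMM C = A * B through cuBLASLt, column-major, no batching.
void LtSgemmSketch(cublasLtHandle_t handle,
                   const float* d_a, const float* d_b, float* d_c,
                   int m, int n, int k, cudaStream_t stream) {
  cublasLtMatmulDesc_t op_desc = nullptr;
  cublasLtMatrixLayout_t a_desc = nullptr, b_desc = nullptr, c_desc = nullptr;

  // CUDA 11+ signature; pre-11 toolkits take a cudaDataType compute type,
  // which is what the #if/#else blocks in the plugin select between.
  cublasLtMatmulDescCreate(&op_desc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  cublasOperation_t no_trans = CUBLAS_OP_N;
  cublasLtMatmulDescSetAttribute(
      op_desc, CUBLASLT_MATMUL_DESC_TRANSA, &no_trans, sizeof(no_trans));
  cublasLtMatmulDescSetAttribute(
      op_desc, CUBLASLT_MATMUL_DESC_TRANSB, &no_trans, sizeof(no_trans));

  // Layouts carry rows, cols, and leading dimension; the plugin additionally
  // sets BATCH_COUNT and STRIDED_BATCH_OFFSET attributes on each layout.
  cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_32F, m, k, m);
  cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_32F, k, n, k);
  cublasLtMatrixLayoutCreate(&c_desc, CUDA_R_32F, m, n, m);

  // Host-side scalars (default pointer mode); the plugin instead uses
  // CUBLASLT_POINTER_MODE_DEVICE with device-resident alpha/beta and a
  // caller-provided workspace.
  float alpha = 1.0f, beta = 0.0f;
  cublasLtMatmul(handle, op_desc, &alpha, d_a, a_desc, d_b, b_desc, &beta,
                 d_c, c_desc, d_c, c_desc, /*algo=*/nullptr,
                 /*workspace=*/nullptr, /*workspaceSizeInBytes=*/0, stream);

  cublasLtMatrixLayoutDestroy(c_desc);
  cublasLtMatrixLayoutDestroy(b_desc);
  cublasLtMatrixLayoutDestroy(a_desc);
  cublasLtMatmulDescDestroy(op_desc);
}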
...
...
@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef POLY_UTIL_CC_
#define POLY_UTIL_CC_
...
@@ -26,61 +25,62 @@ using gpc::gpc_free_polygon;
using gpc::gpc_polygon_clip;

template <class T>
void Array2PointVec(const T* box,
                    const size_t box_size,
                    std::vector<Point_<T>>* vec) {
  size_t pts_num = box_size / 2;
  (*vec).resize(pts_num);
  for (size_t i = 0; i < pts_num; i++) {
    (*vec).at(i).x = box[2 * i];
    (*vec).at(i).y = box[2 * i + 1];
  }
}

template <class T>
void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly) {
  size_t pts_num = box_size / 2;
  (*poly).num_contours = 1;
  (*poly).hole = reinterpret_cast<int*>(malloc(sizeof(int)));
  (*poly).hole[0] = 0;
  (*poly).contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
  (*poly).contour->num_vertices = pts_num;
  (*poly).contour->vertex =
      (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num);
  for (size_t i = 0; i < pts_num; ++i) {
    (*poly).contour->vertex[i].x = box[2 * i];
    (*poly).contour->vertex[i].y = box[2 * i + 1];
  }
}

template <class T>
void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon* poly) {
  int pts_num = vec.size();
  (*poly).num_contours = 1;
  (*poly).hole = reinterpret_cast<int*>(malloc(sizeof(int)));
  (*poly).hole[0] = 0;
  (*poly).contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
  (*poly).contour->num_vertices = pts_num;
  (*poly).contour->vertex =
      (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num);
  for (size_t i = 0; i < pts_num; ++i) {
    (*poly).contour->vertex[i].x = vec[i].x;
    (*poly).contour->vertex[i].y = vec[i].y;
  }
}

template <class T>
void Poly2PointVec(const gpc::gpc_vertex_list& contour,
                   std::vector<Point_<T>>* vec) {
  int pts_num = contour.num_vertices;
  (*vec).resize(pts_num);
  for (int i = 0; i < pts_num; i++) {
    (*vec).at(i).x = contour.vertex[i].x;
    (*vec).at(i).y = contour.vertex[i].y;
  }
}

template <class T>
T GetContourArea(const std::vector<Point_<T>>& vec) {
  size_t pts_num = vec.size();
  if (pts_num < 3) return T(0.);
  T area = T(0.);
...
@@ -96,17 +96,19 @@ T PolyArea(const T* box, const size_t box_size, const bool normalized) {
  // If coordinate values are invalid
  // or area size <= 0, return 0.
  std::vector<Point_<T>> vec;
  Array2PointVec<T>(box, box_size, &vec);
  return GetContourArea<T>(vec);
}

template <class T>
T PolyOverlapArea(const T* box1,
                  const T* box2,
                  const size_t box_size,
                  const bool normalized) {
  gpc::gpc_polygon poly1;
  gpc::gpc_polygon poly2;
  Array2Poly<T>(box1, box_size, &poly1);
  Array2Poly<T>(box2, box_size, &poly2);
  gpc::gpc_polygon respoly;
  gpc::gpc_op op = gpc::GPC_INT;
  gpc::gpc_polygon_clip(op, &poly2, &poly1, &respoly);
...
@@ -115,7 +117,7 @@ T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size,
  int contour_num = respoly.num_contours;
  for (int i = 0; i < contour_num; ++i) {
    std::vector<Point_<T>> resvec;
    Poly2PointVec<T>(respoly.contour[i], &resvec);
    // inter_area += std::fabs(cv::contourArea(resvec)) + 0.5f *
    // (cv::arcLength(resvec, true));
    inter_area += GetContourArea<T>(resvec);
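The signature changes in this file all come from cpplint's runtime/references check, which forbids non-const reference parameters: every mutable output becomes a pointer, so call sites spell out &vec and &poly1. A small self-contained illustration of the convention (names are illustrative, not from the codebase):

#include <cstddef>
#include <vector>

// Outputs as pointers rather than non-const references; the '&' at the call
// site is what cpplint wants readers to see.
void FillSquares(std::size_t n, std::vector<int>* out) {
  out->resize(n);
  for (std::size_t i = 0; i < n; ++i) {
    (*out)[i] = static_cast<int>(i * i);
  }
}

int main() {
  std::vector<int> squares;
  FillSquares(4, &squares);  // visibly an output argument
  return squares[3] == 9 ? 0 : 1;
}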
...
...
@@ -11,9 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <vector>
...
@@ -44,31 +42,32 @@ class Point_ {
};

template <class T>
void Array2PointVec(const T* box,
                    const size_t box_size,
                    std::vector<Point_<T>>* vec);

template <class T>
void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly);

template <class T>
void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon* poly);

template <class T>
void Poly2PointVec(const gpc::gpc_vertex_list& contour,
                   std::vector<Point_<T>>* vec);

template <class T>
T GetContourArea(const std::vector<Point_<T>>& vec);

template <class T>
T PolyArea(const T* box, const size_t box_size, const bool normalized);

template <class T>
T PolyOverlapArea(const T* box1,
                  const T* box2,
                  const size_t box_size,
                  const bool normalized);

}  // namespace operators
}  // namespace paddle

#include "paddle/fluid/operators/detection/poly_util.cc"
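The header itself now relies solely on #pragma once: the commit deletes the POLY_UTIL_H_ guard pair at the top and the matching #endif at the bottom. For reference, the removed idiom looks like this:

// The include-guard pair deleted above; #pragma once achieves the same
// double-inclusion protection without a unique per-header macro name
// (and without a trailing #endif to keep in sync).
#ifndef POLY_UTIL_H_
#define POLY_UTIL_H_
// ... declarations ...
#endif  // POLY_UTIL_H_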
...
@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h"

namespace {
...
@@ -49,19 +48,23 @@ static std::vector<int64_t> Transpose(const std::vector<int64_t>& x,
  size_t axis_size = axis.size();
  auto axis_set = std::set<int>(axis.begin(), axis.end());
  PADDLE_ENFORCE_EQ(axis_set.size(),
                    axis_size,
                    paddle::platform::errors::InvalidArgument(
                        "In an axis array, elements must be unique."));
  PADDLE_ENFORCE_EQ(in_rank,
                    axis_size,
                    paddle::platform::errors::InvalidArgument(
                        "The input dimension's size "
                        "should be equal to the axis's size. "
                        "But received dimension is %d, "
                        "axis's size is %d",
                        in_rank,
                        axis_size));
  PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()),
                    axis_size,
                    paddle::platform::errors::InvalidArgument(
                        "Axis values must be ranging from 0 to (dims - 1)."));
...
@@ -85,7 +88,8 @@ std::vector<int64_t> GetInputStrides(const ExecutionContext& ctx,
  auto& MatrixDimsFromVector =
      input_name == "X" ? RowMatrixDimsFromVector : ColumnMatrixDimsFromVector;
  phi::funcs::MatDescriptor mat_dim = phi::funcs::CreateMatrixDescriptor(
      MatrixDimsFromVector(new_dims),
      0,
      ctx.Attr<bool>(std::string("trans_") +
                     static_cast<char>(std::tolower(input_name[0]))));
...
@@ -125,16 +129,27 @@ template <typename T>
void ExecuteMatMulV2(const ExecutionContext& ctx,
                     const MKLDNNDeviceContext& dev_ctx,
                     const dnnl::engine onednn_engine,
                     paddle::platform::Place cpu_place,
                     const Tensor* x,
                     const std::vector<int64_t>& x_dims,
                     bool trans_x,
                     const Tensor* y,
                     const std::vector<int64_t>& y_dims,
                     bool trans_y,
                     Tensor* out,
                     const std::vector<int64_t>& out_dims,
                     int execution_number = 0) {
  std::vector<int64_t> x_strides_override = GetInputStrides(ctx, "X");
  std::vector<int64_t> y_strides_override = GetInputStrides(ctx, "Y");
  MatMulV2MKLDNNHandler<T> handler(onednn_engine,
                                   ctx.GetPlace(),
                                   x_dims,
                                   trans_x,
                                   y_dims,
                                   trans_y,
                                   IsOutputFused(ctx),
                                   x_strides_override,
                                   y_strides_override);

  const auto src_memory_p = handler.AcquireSrcMemory(x);
  const auto weights_memory_p = handler.AcquireWeightsMemory(y);
...
@@ -177,44 +192,48 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel<T> {
  void CalculateMatrixDims(const ExecutionContext& ctx,
                           const std::vector<int64_t>& x_dims,
                           const std::vector<int64_t>& y_dims,
                           std::vector<int64_t>* x_bd_dims,
                           std::vector<int64_t>* y_bd_dims,
                           std::vector<int64_t>* out_dims,
                           Tensor* out) const {
    if (x_dims.size() == 1) {
      (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[0];
    } else if (x_dims.size() == 2) {
      (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[1];
      (*x_bd_dims)[(*x_bd_dims).size() - 2] = x_dims[0];
    } else {
      for (size_t i = 0; i < x_dims.size(); ++i) {
        (*x_bd_dims)[(*x_bd_dims).size() - x_dims.size() + i] = x_dims[i];
      }
    }
    if (y_dims.size() == 1) {
      (*y_bd_dims)[(*x_bd_dims).size() - 2] = y_dims[0];
    } else if (y_dims.size() == 2) {
      (*y_bd_dims)[(*y_bd_dims).size() - 1] = y_dims[1];
      (*y_bd_dims)[(*y_bd_dims).size() - 2] = y_dims[0];
    } else {
      for (size_t i = 0; i < y_dims.size(); ++i) {
        (*y_bd_dims)[(*y_bd_dims).size() - y_dims.size() + i] = y_dims[i];
      }
    }

    if (!IsOutputFused(ctx) && x_dims.size() > 2 && y_dims.size() > 2) {
      for (size_t i = 0; i < (*x_bd_dims).size() - 2; ++i) {
        PADDLE_ENFORCE_EQ(
            (*x_bd_dims)[i] == (*y_bd_dims)[i] || (*x_bd_dims)[i] == 1 ||
                (*y_bd_dims)[i] == 1,
            true,
            paddle::platform::errors::InvalidArgument(
                "Tensor dimensions are incorrect for broadcasting."
                "Dimensions in X and Y must be same or equal to 1, but "
                "received x_dim[%d]=%d and y_dims[%d]= %d",
                i,
                (*x_bd_dims)[i],
                i,
                (*y_bd_dims)[i]));
        (*out_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]);
      }
      out->Resize(phi::make_ddim((*out_dims)));
    }
  }
...
@@ -238,11 +257,20 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel<T> {
    std::vector<int64_t> x_bd_dims(ndims, 1);
    std::vector<int64_t> y_bd_dims(ndims, 1);

    CalculateMatrixDims(
        ctx, x_dims, y_dims, &x_bd_dims, &y_bd_dims, &out_dims, out);

    ExecuteMatMulV2<T>(ctx,
                       dev_ctx,
                       onednn_engine,
                       ctx.GetPlace(),
                       x,
                       x_bd_dims,
                       trans_x,
                       y,
                       y_bd_dims,
                       trans_y,
                       out,
                       out_dims);
  }
};
...
@@ -253,36 +281,46 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel<T> {
  void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); }

 private:
  void CalculateGradMatrixDims(const ExecutionContext& ctx,
                               Tensor* dx_tmp,
                               Tensor* dy_tmp,
                               const std::vector<int64_t>& dx_dims,
                               const std::vector<int64_t>& dy_dims,
                               std::vector<int64_t>* dx_bd_dims,
                               std::vector<int64_t>* dy_bd_dims) const {
    for (size_t i = 0; i < dx_dims.size() - 2; ++i) {
      if (dx_dims[i] != dy_dims[i]) {
        if (dx_dims[i] == 1) {
          (*dx_bd_dims)[i] = dy_dims[i];
        } else {
          (*dy_bd_dims)[i] = dx_dims[i];
        }
      }
    }

    dx_tmp->Resize(phi::make_ddim((*dx_bd_dims)));
    dx_tmp->mutable_data<T>(ctx.GetPlace());
    dy_tmp->Resize(phi::make_ddim((*dy_bd_dims)));
    dy_tmp->mutable_data<T>(ctx.GetPlace());
  }

  void ReduceSumForMatmulGradOutput(
      const ExecutionContext& ctx,
      const MKLDNNDeviceContext& dev_ctx,
      const dnnl::engine onednn_engine,
      const Tensor* dx_tmp,
      Tensor* dx,
      const std::vector<int64_t>& dx_dims,
      const std::vector<int64_t>& squeezed_dims) const {
    paddle::platform::ReductionMKLDNNHandler<T> handler(
        dnnl::algorithm::reduction_sum,
        0.0f,
        0.0f,
        onednn_engine,
        ctx.GetPlace(),
        dx_tmp,
        dx,
        dx_dims);

    auto src_memory_p = handler.AcquireSrcMemory(dx_tmp);
    auto dst_memory_p = handler.AcquireDstMemory(dx);
...
@@ -326,8 +364,8 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel<T> {
    } else if (x_dims.size() != y_dims.size()) {
      is_broadcast = true;
    } else {
      is_broadcast = !std::equal(x_dims.cbegin(),
                                 x_dims.cbegin() + x_dims.size() - 2,
                                 y_dims.cbegin());
    }
...
@@ -362,44 +400,138 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel<T> {
    std::vector<int64_t> dx_bd_dims(x_dims);
    std::vector<int64_t> dy_bd_dims(y_dims);

    CalculateGradMatrixDims(
        ctx, &dx_tmp, &dy_tmp, x_dims, y_dims, &dx_bd_dims, &dy_bd_dims);

    if (trans_x && trans_y) {
      ExecuteMatMulV2<T>(ctx,
                         dev_ctx,
                         onednn_engine,
                         ctx.GetPlace(),
                         y,
                         y_dims,
                         true,
                         dout,
                         dout_dims,
                         true,
                         &dx_tmp,
                         dx_bd_dims,
                         1);
      ExecuteMatMulV2<T>(ctx,
                         dev_ctx,
                         onednn_engine,
                         ctx.GetPlace(),
                         dout,
                         dout_dims,
                         true,
                         x,
                         x_dims,
                         true,
                         &dy_tmp,
                         dy_bd_dims,
                         2);
    } else if (trans_x) {
      ExecuteMatMulV2<T>(ctx,
                         dev_ctx,
                         onednn_engine,
                         ctx.GetPlace(),
                         y,
                         y_dims,
                         false,
                         dout,
                         dout_dims,
                         true,
                         &dx_tmp,
                         dx_bd_dims,
                         1);
      ExecuteMatMulV2<T>(ctx,
                         dev_ctx,
                         onednn_engine,
                         ctx.GetPlace(),
                         x,
                         x_dims,
                         false,
                         dout,
                         dout_dims,
                         false,
                         &dy_tmp,
                         dy_bd_dims,
                         2);
    } else if (trans_y) {
      ExecuteMatMulV2<T>(ctx,
                         dev_ctx,
                         onednn_engine,
                         ctx.GetPlace(),
                         dout,
                         dout_dims,
                         false,
                         y,
                         y_dims,
                         false,
                         &dx_tmp,
                         dx_bd_dims,
                         1);
      ExecuteMatMulV2<T>(ctx,
                         dev_ctx,
                         onednn_engine,
                         ctx.GetPlace(),
                         dout,
                         dout_dims,
                         true,
                         x,
                         x_dims,
                         false,
                         &dy_tmp,
                         dy_bd_dims,
                         2);
    } else {
      ExecuteMatMulV2<T>(ctx,
                         dev_ctx,
                         onednn_engine,
                         ctx.GetPlace(),
                         dout,
                         dout_dims,
                         false,
                         y,
                         y_dims,
                         true,
                         &dx_tmp,
                         dx_bd_dims,
                         1);
      ExecuteMatMulV2<T>(ctx,
                         dev_ctx,
                         onednn_engine,
                         ctx.GetPlace(),
                         x,
                         x_dims,
                         true,
                         dout,
                         dout_dims,
                         false,
                         &dy_tmp,
                         dy_bd_dims,
                         2);
    }

    if (x_dims != dx_bd_dims) {
      ReduceSumForMatmulGradOutput(ctx,
                                   dev_ctx,
                                   onednn_engine,
                                   &dx_tmp,
                                   dx,
                                   x_dims,
                                   phi::vectorize(x->dims()));
    } else {
      *dx = std::move(dx_tmp);
    }
    if (y_dims != dy_bd_dims) {
      ReduceSumForMatmulGradOutput(ctx,
                                   dev_ctx,
                                   onednn_engine,
                                   &dy_tmp,
                                   dy,
                                   y_dims,
                                   phi::vectorize(y->dims()));
    } else {
      *dy = std::move(dy_tmp);
    }
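The four trans_x/trans_y branches above are the standard matrix-multiplication gradient identities, specialized for how each operand entered the forward pass. Writing dOut for the upstream gradient of the output, the calls implement:

\begin{aligned}
\text{neither transposed:} \quad & dX = dOut \cdot Y^{T}, & dY &= X^{T} \cdot dOut \\
\text{trans\_x only:} \quad & dX = Y \cdot dOut^{T}, & dY &= X \cdot dOut \\
\text{trans\_y only:} \quad & dX = dOut \cdot Y, & dY &= dOut^{T} \cdot X \\
\text{both:} \quad & dX = Y^{T} \cdot dOut^{T}, & dY &= dOut^{T} \cdot X^{T}
\end{aligned}

The ReduceSumForMatmulGradOutput calls that follow then sum the broadcast batch dimensions back down when dx_tmp or dy_tmp was computed at a larger broadcast shape than the original operand.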
...
@@ -413,10 +545,14 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel<T> {
};
}  // anonymous namespace

REGISTER_OP_KERNEL(matmul_v2,
                   MKLDNN,
                   ::paddle::platform::CPUPlace,
                   MatMulV2MKLDNNKernel<float>,
                   MatMulV2MKLDNNKernel<paddle::platform::bfloat16>);

REGISTER_OP_KERNEL(matmul_v2_grad,
                   MKLDNN,
                   ::paddle::platform::CPUPlace,
                   MatMulV2GradMKLDNNKernel<float>,
                   MatMulV2GradMKLDNNKernel<paddle::platform::bfloat16>);
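CalculateMatrixDims above pads both operand shapes to a common rank and then broadcasts the leading batch dimensions NumPy-style: each pair of entries must be equal or contain a 1, and the output keeps the larger value. A dependency-free sketch of just that rule, under the assumption that both shape vectors have already been padded to equal rank (function name is illustrative):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Batch-dimension broadcasting as in CalculateMatrixDims: the trailing two
// dims belong to the matrices themselves, so only the leading dims are
// broadcast. Returns an empty vector on a broadcast violation.
std::vector<int64_t> BroadcastBatchDims(const std::vector<int64_t>& x,
                                        const std::vector<int64_t>& y) {
  std::vector<int64_t> out(x.size(), 1);  // trailing two dims left to matmul
  for (std::size_t i = 0; i + 2 < x.size(); ++i) {
    if (!(x[i] == y[i] || x[i] == 1 || y[i] == 1)) return {};
    out[i] = std::max(x[i], y[i]);
  }
  return out;
}

int main() {
  // Shapes [2, 1, 4, 5] x [1, 3, 5, 6]: batch dims broadcast to [2, 3].
  const auto out = BroadcastBatchDims({2, 1, 4, 5}, {1, 3, 5, 6});
  std::printf("batch dims: %lld x %lld\n",
              static_cast<long long>(out[0]),
              static_cast<long long>(out[1]));
  return 0;
}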