diff --git a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu index 1502f3913b5414ea78d1bd5e32574bc27c38dfc6..29f7c39e33a4b4b9e60e4ad148126b7d9d8defe5 100644 --- a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu @@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h" namespace plf = paddle::platform; @@ -21,73 +20,174 @@ namespace inference { namespace tensorrt { namespace plugin { float zero = 0; -void Ltgemm_int8_linear( - cublasLtHandle_t ltHandle, const int8_t* A, cublasLtMatrixLayout_t Adesc, - int8_t* Atransform, cublasLtMatrixLayout_t AtransformDesc, bool transA_, - const int8_t* B, cublasLtMatrixLayout_t Bdesc, int8_t* Btransform, - cublasLtMatrixLayout_t BtransformDesc, bool transB_, int8_t* C, - cublasLtMatrixLayout_t Cdesc, int8_t* Ctransform, - cublasLtMatrixLayout_t CtransformDesc, - cublasLtMatrixTransformDesc_t transformDescT, - cublasLtMatrixTransformDesc_t transformDescN, - cublasLtMatmulDesc_t matmulDesc, void* alpha_scale, void* alpha_zero, - void* alpha_one, void* workspace, cudaStream_t stream) { +void Ltgemm_int8_linear(cublasLtHandle_t ltHandle, + const int8_t* A, + cublasLtMatrixLayout_t Adesc, + int8_t* Atransform, + cublasLtMatrixLayout_t AtransformDesc, + bool transA_, + const int8_t* B, + cublasLtMatrixLayout_t Bdesc, + int8_t* Btransform, + cublasLtMatrixLayout_t BtransformDesc, + bool transB_, + int8_t* C, + cublasLtMatrixLayout_t Cdesc, + int8_t* Ctransform, + cublasLtMatrixLayout_t CtransformDesc, + cublasLtMatrixTransformDesc_t transformDescT, + cublasLtMatrixTransformDesc_t transformDescN, + cublasLtMatmulDesc_t matmulDesc, + void* alpha_scale, + void* alpha_zero, + void* alpha_one, + void* workspace, + cudaStream_t stream) { if (transA_) { - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( - ltHandle, transformDescT, alpha_one, A, Adesc, alpha_zero, nullptr, - nullptr, Atransform, AtransformDesc, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(ltHandle, + transformDescT, + alpha_one, + A, + Adesc, + alpha_zero, + nullptr, + nullptr, + Atransform, + AtransformDesc, + stream)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( - ltHandle, transformDescN, alpha_one, A, Adesc, alpha_zero, nullptr, - nullptr, Atransform, AtransformDesc, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(ltHandle, + transformDescN, + alpha_one, + A, + Adesc, + alpha_zero, + nullptr, + nullptr, + Atransform, + AtransformDesc, + stream)); } if (transB_) { - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( - ltHandle, transformDescN, alpha_one, B, Bdesc, alpha_zero, nullptr, - nullptr, Btransform, BtransformDesc, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(ltHandle, + transformDescN, + alpha_one, + B, + Bdesc, + alpha_zero, + nullptr, + nullptr, + Btransform, + BtransformDesc, + stream)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( - ltHandle, transformDescT, alpha_one, B, Bdesc, alpha_zero, nullptr, - nullptr, Btransform, BtransformDesc, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(ltHandle, + 
transformDescT, + alpha_one, + B, + Bdesc, + alpha_zero, + nullptr, + nullptr, + Btransform, + BtransformDesc, + stream)); } - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul( - ltHandle, matmulDesc, alpha_scale, Atransform, AtransformDesc, Btransform, - BtransformDesc, nullptr, Ctransform, CtransformDesc, Ctransform, - CtransformDesc, nullptr, workspace, 0, stream)); - - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( - ltHandle, transformDescN, alpha_one, Ctransform, CtransformDesc, - alpha_zero, nullptr, nullptr, C, Cdesc, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul(ltHandle, + matmulDesc, + alpha_scale, + Atransform, + AtransformDesc, + Btransform, + BtransformDesc, + nullptr, + Ctransform, + CtransformDesc, + Ctransform, + CtransformDesc, + nullptr, + workspace, + 0, + stream)); + + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(ltHandle, + transformDescN, + alpha_one, + Ctransform, + CtransformDesc, + alpha_zero, + nullptr, + nullptr, + C, + Cdesc, + stream)); } -void Ltgemm_fp32_linear(cublasLtHandle_t ltHandle, const float* A, - cublasLtMatrixLayout_t Adesc, const float* B, - cublasLtMatrixLayout_t Bdesc, float* C, +void Ltgemm_fp32_linear(cublasLtHandle_t ltHandle, + const float* A, + cublasLtMatrixLayout_t Adesc, + const float* B, + cublasLtMatrixLayout_t Bdesc, + float* C, cublasLtMatrixLayout_t Cdesc, - cublasLtMatmulDesc_t matmulDesc, void* alpha_scale, - void* alpha_zero, void* workspace, + cublasLtMatmulDesc_t matmulDesc, + void* alpha_scale, + void* alpha_zero, + void* workspace, cudaStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul( - ltHandle, matmulDesc, alpha_scale, A, Adesc, B, Bdesc, alpha_zero, C, - Cdesc, C, Cdesc, nullptr, workspace, 0, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul(ltHandle, + matmulDesc, + alpha_scale, + A, + Adesc, + B, + Bdesc, + alpha_zero, + C, + Cdesc, + C, + Cdesc, + nullptr, + workspace, + 0, + stream)); } -void Ltgemm_fp16_linear(cublasLtHandle_t ltHandle, const half* A, - cublasLtMatrixLayout_t Adesc, const half* B, - cublasLtMatrixLayout_t Bdesc, half* C, +void Ltgemm_fp16_linear(cublasLtHandle_t ltHandle, + const half* A, + cublasLtMatrixLayout_t Adesc, + const half* B, + cublasLtMatrixLayout_t Bdesc, + half* C, cublasLtMatrixLayout_t Cdesc, - cublasLtMatmulDesc_t matmulDesc, void* alpha_scale, - void* alpha_zero, void* workspace, + cublasLtMatmulDesc_t matmulDesc, + void* alpha_scale, + void* alpha_zero, + void* workspace, cudaStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul( - ltHandle, matmulDesc, alpha_scale, A, Adesc, B, Bdesc, alpha_zero, C, - Cdesc, C, Cdesc, nullptr, workspace, 0, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul(ltHandle, + matmulDesc, + alpha_scale, + A, + Adesc, + B, + Bdesc, + alpha_zero, + C, + Cdesc, + C, + Cdesc, + nullptr, + workspace, + 0, + stream)); } nvinfer1::DataType MatmulPlugin::getOutputDataType( - int index, const nvinfer1::DataType* input_types, + int index, + const nvinfer1::DataType* input_types, int nb_inputs) const TRT_NOEXCEPT { return input_types[0]; } @@ -123,13 +223,17 @@ nvinfer1::Dims MatmulPlugin::getOutputDimensions( } bool MatmulPlugin::supportsFormatCombination( - int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, + int32_t pos, + nvinfer1::PluginTensorDesc const* inOut, + int32_t nbInputs, int32_t nbOutputs) const TRT_NOEXCEPT { - PADDLE_ENFORCE_EQ(nbInputs, 2, + PADDLE_ENFORCE_EQ(nbInputs, + 2, platform::errors::InvalidArgument("Must have 2 inputs, " "but got %d 
input(s). ", nbInputs)); - PADDLE_ENFORCE_EQ(nbOutputs, getNbOutputs(), + PADDLE_ENFORCE_EQ(nbOutputs, + getNbOutputs(), platform::errors::InvalidArgument("Must have 1 output, " "but got %d output(s). ", nbOutputs)); @@ -192,65 +296,89 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, reinterpret_cast(&Ctransform_), sizeof(int8_t) * ((m_ + 32 - 1) / 32 * 32) / 32 * ldctransform)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( - &Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_, - AopTranspose == CUBLAS_OP_N ? k_ : n_, - AopTranspose == CUBLAS_OP_N ? n_ : k_)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutCreate(&Adesc_, + cudadataTypeIO, + AopTranspose == CUBLAS_OP_N ? n_ : k_, + AopTranspose == CUBLAS_OP_N ? k_ : n_, + AopTranspose == CUBLAS_OP_N ? n_ : k_)); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Adesc_, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), + Adesc_, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( - &Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_, - BopTranspose == CUBLAS_OP_N ? m_ : k_, - BopTranspose == CUBLAS_OP_N ? k_ : m_)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutCreate(&Bdesc_, + cudadataTypeIO, + BopTranspose == CUBLAS_OP_N ? k_ : m_, + BopTranspose == CUBLAS_OP_N ? m_ : k_, + BopTranspose == CUBLAS_OP_N ? 
k_ : m_)); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Bdesc_, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), + Bdesc_, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(strideb), sizeof(strideb))); PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Cdesc_, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), + Cdesc_, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(stridec), sizeof(stridec))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &AtransformDesc_, cudadataTypeIO, n_, k_, ldatransform)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - AtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(AtransformDesc_, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( AtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &BtransformDesc_, cudadataTypeIO, m_, k_, ldbtransform)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - BtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - BtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL4_4R2_8C, - sizeof(COL4_4R2_8C))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(BtransformDesc_, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(BtransformDesc_, + CUBLASLT_MATRIX_LAYOUT_ORDER, + &COL4_4R2_8C, + sizeof(COL4_4R2_8C))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &CtransformDesc_, cudadataTypeIO, n_, m_, ldctransform)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - CtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(CtransformDesc_, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( CtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); @@ -259,23 +387,33 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescCreate( &transformDescT_, cudaDataTypeS)); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( - transformDescT_, 
CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, - &cudaDataTypeS, sizeof(cudaDataTypeS))); + transformDescT_, + CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, + &cudaDataTypeS, + sizeof(cudaDataTypeS))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( - transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &Transpose, + transformDescT_, + CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, + &Transpose, sizeof(Transpose))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( - transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, - &transform_model, sizeof(transform_model))); + transformDescT_, + CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, + &transform_model, + sizeof(transform_model))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescCreate( &transformDescN_, cudaDataTypeS)); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( - transformDescN_, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, - &cudaDataTypeS, sizeof(cudaDataTypeS))); + transformDescN_, + CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, + &cudaDataTypeS, + sizeof(cudaDataTypeS))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( - transformDescN_, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, - &transform_model, sizeof(transform_model))); + transformDescN_, + CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, + &transform_model, + sizeof(transform_model))); cublasOperation_t ATranspose = CUBLAS_OP_N, BTranspose = CUBLAS_OP_T; cublasLtPointerMode_t matmul_model = @@ -289,15 +427,21 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, &matmulDesc_, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &ATranspose, - sizeof(ATranspose))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BTranspose, - sizeof(BTranspose))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, - sizeof(matmul_model))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc_, + CUBLASLT_MATMUL_DESC_TRANSA, + &ATranspose, + sizeof(ATranspose))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc_, + CUBLASLT_MATMUL_DESC_TRANSB, + &BTranspose, + sizeof(BTranspose))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc_, + CUBLASLT_MATMUL_DESC_POINTER_MODE, + &matmul_model, + sizeof(matmul_model))); std::vector<float> alpha_tem(n_, 0); for (int i = 0; i < n_; i++) { @@ -305,18 +449,20 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, } PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( reinterpret_cast<void**>(&alpha_scale_), n_ * sizeof(float))); - cudaMemcpyAsync(alpha_scale_, &alpha_tem[0], n_ * sizeof(float), + cudaMemcpyAsync(alpha_scale_, + &alpha_tem[0], + n_ * sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(float))); - cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), - cudaMemcpyHostToDevice); + cudaMemcpyAsync( + alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); float one_tem = 1; PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc(reinterpret_cast<void**>(&alpha_one_), sizeof(float))); - cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float), - cudaMemcpyHostToDevice); + cudaMemcpyAsync( + alpha_one_, &one_tem, sizeof(float),
cudaMemcpyHostToDevice); } else if (type_ == nvinfer1::DataType::kHALF) { cudaDataType_t cudadataTypeIO = CUDA_R_16F; cudaDataType_t cudaDataTypeS = CUDA_R_16F; @@ -325,41 +471,57 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_16F; #endif - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( - &Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_, - AopTranspose == CUBLAS_OP_N ? k_ : n_, - AopTranspose == CUBLAS_OP_N ? n_ : k_)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutCreate(&Adesc_, + cudadataTypeIO, + AopTranspose == CUBLAS_OP_N ? n_ : k_, + AopTranspose == CUBLAS_OP_N ? k_ : n_, + AopTranspose == CUBLAS_OP_N ? n_ : k_)); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Adesc_, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), + Adesc_, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( - &Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_, - BopTranspose == CUBLAS_OP_N ? m_ : k_, - BopTranspose == CUBLAS_OP_N ? k_ : m_)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutCreate(&Bdesc_, + cudadataTypeIO, + BopTranspose == CUBLAS_OP_N ? k_ : m_, + BopTranspose == CUBLAS_OP_N ? m_ : k_, + BopTranspose == CUBLAS_OP_N ? 
k_ : m_)); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Bdesc_, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), + Bdesc_, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(strideb), sizeof(strideb))); PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Cdesc_, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), + Cdesc_, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; @@ -372,26 +534,32 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, &matmulDesc_, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, - sizeof(AopTranspose))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, - sizeof(BopTranspose))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, - sizeof(matmul_model))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc_, + CUBLASLT_MATMUL_DESC_TRANSA, + &AopTranspose, + sizeof(AopTranspose))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc_, + CUBLASLT_MATMUL_DESC_TRANSB, + &BopTranspose, + sizeof(BopTranspose))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc_, + CUBLASLT_MATMUL_DESC_POINTER_MODE, + &matmul_model, + sizeof(matmul_model))); half alpha_tem = static_cast<half>(alpha_); PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(half))); - cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half), - cudaMemcpyHostToDevice); + cudaMemcpyAsync( + alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice); half zero_tem = static_cast<half>(zero); PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(half))); - cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(half), - cudaMemcpyHostToDevice); + cudaMemcpyAsync( + alpha_zero_, &zero_tem, sizeof(half), cudaMemcpyHostToDevice); } else { cudaDataType_t cudadataTypeIO = CUDA_R_32F; cudaDataType_t cudaDataTypeS = CUDA_R_32F; @@ -400,41 +568,57 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F_FAST_16F; #endif - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( - &Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_, - AopTranspose == CUBLAS_OP_N ? k_ : n_, - AopTranspose == CUBLAS_OP_N ?
n_ : k_)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutCreate(&Adesc_, + cudadataTypeIO, + AopTranspose == CUBLAS_OP_N ? n_ : k_, + AopTranspose == CUBLAS_OP_N ? k_ : n_, + AopTranspose == CUBLAS_OP_N ? n_ : k_)); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Adesc_, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), + Adesc_, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( - &Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_, - BopTranspose == CUBLAS_OP_N ? m_ : k_, - BopTranspose == CUBLAS_OP_N ? k_ : m_)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutCreate(&Bdesc_, + cudadataTypeIO, + BopTranspose == CUBLAS_OP_N ? k_ : m_, + BopTranspose == CUBLAS_OP_N ? m_ : k_, + BopTranspose == CUBLAS_OP_N ? k_ : m_)); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Bdesc_, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), + Bdesc_, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(strideb), sizeof(strideb))); PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Cdesc_, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), + Cdesc_, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; @@ -447,32 +631,39 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, &matmulDesc_, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, - sizeof(AopTranspose))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, - sizeof(BopTranspose))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, - sizeof(matmul_model))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc_, + 
CUBLASLT_MATMUL_DESC_TRANSA, + &AopTranspose, + sizeof(AopTranspose))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc_, + CUBLASLT_MATMUL_DESC_TRANSB, + &BopTranspose, + sizeof(BopTranspose))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc_, + CUBLASLT_MATMUL_DESC_POINTER_MODE, + &matmul_model, + sizeof(matmul_model))); float alpha_tem = alpha_; PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(float))); - cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float), - cudaMemcpyHostToDevice); + cudaMemcpyAsync( + alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(float))); - cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), - cudaMemcpyHostToDevice); + cudaMemcpyAsync( + alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); } } -void MatmulPlugin::attachToContext( - cudnnContext* cudnnContext, cublasContext* cublasContext, - nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT { +void MatmulPlugin::attachToContext(cudnnContext* cudnnContext, + cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) + TRT_NOEXCEPT { dyl::cublasLtCreate(&cublas_); } @@ -509,35 +700,76 @@ void MatmulPlugin::terminate() TRT_NOEXCEPT { } */ } -int MatmulPlugin::enqueue(int batchSize, const void* const* inputs, +int MatmulPlugin::enqueue(int batchSize, + const void* const* inputs, #if IS_TRT_VERSION_LT(8000) - void** outputs, void* workspace, + void** outputs, + void* workspace, cudaStream_t stream) { #else - void* const* outputs, void* workspace, + void* const* outputs, + void* workspace, cudaStream_t stream) TRT_NOEXCEPT { #endif if (type_ == nvinfer1::DataType::kINT8) { const int8_t* B = static_cast<const int8_t*>(inputs[0]); const int8_t* A = static_cast<const int8_t*>(inputs[1]); int8_t* C = static_cast<int8_t*>(outputs[0]); - Ltgemm_int8_linear( - cublas_, A, Adesc_, Atransform_, AtransformDesc_, transA_, B, Bdesc_, - Btransform_, BtransformDesc_, transB_, C, Cdesc_, Ctransform_, - CtransformDesc_, transformDescT_, transformDescN_, matmulDesc_, - alpha_scale_, alpha_zero_, alpha_one_, workspace, stream); + Ltgemm_int8_linear(cublas_, + A, + Adesc_, + Atransform_, + AtransformDesc_, + transA_, + B, + Bdesc_, + Btransform_, + BtransformDesc_, + transB_, + C, + Cdesc_, + Ctransform_, + CtransformDesc_, + transformDescT_, + transformDescN_, + matmulDesc_, + alpha_scale_, + alpha_zero_, + alpha_one_, + workspace, + stream); } else if (type_ == nvinfer1::DataType::kFLOAT) { const float* B = static_cast<const float*>(inputs[0]); const float* A = static_cast<const float*>(inputs[1]); float* C = static_cast<float*>(outputs[0]); - Ltgemm_fp32_linear(cublas_, A, Adesc_, B, Bdesc_, C, Cdesc_, matmulDesc_, - alpha_scale_, alpha_zero_, workspace, stream); + Ltgemm_fp32_linear(cublas_, + A, + Adesc_, + B, + Bdesc_, + C, + Cdesc_, + matmulDesc_, + alpha_scale_, + alpha_zero_, + workspace, + stream); } else if (type_ == nvinfer1::DataType::kHALF) { const half* B = static_cast<const half*>(inputs[0]); const half* A = static_cast<const half*>(inputs[1]); half* C = static_cast<half*>(outputs[0]); - Ltgemm_fp16_linear(cublas_, A, Adesc_, B, Bdesc_, C, Cdesc_, matmulDesc_, - alpha_scale_, alpha_zero_, workspace, stream); + Ltgemm_fp16_linear(cublas_, + A, + Adesc_, + B, + Bdesc_, + C, + Cdesc_, + matmulDesc_, + alpha_scale_, + alpha_zero_, + workspace, + stream); } else { PADDLE_THROW(platform::errors::InvalidArgument( "VarMessageToVarType:Unsupported type")); @@ -546,13 +778,16 @@ int
MatmulPlugin::enqueue(int batchSize, const void* const* inputs, } nvinfer1::DataType MatmulPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType* input_types, + int index, + const nvinfer1::DataType* input_types, int nb_inputs) const TRT_NOEXCEPT { return input_types[0]; } nvinfer1::DimsExprs MatmulPluginDynamic::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT { nvinfer1::DimsExprs output_dims(inputs[0]); if (transB_) { @@ -569,13 +804,17 @@ nvinfer1::DimsExprs MatmulPluginDynamic::getOutputDimensions( } bool MatmulPluginDynamic::supportsFormatCombination( - int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, + int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) TRT_NOEXCEPT { - PADDLE_ENFORCE_EQ(nbInputs, 2, + PADDLE_ENFORCE_EQ(nbInputs, + 2, platform::errors::InvalidArgument("Must have 2 inputs, " "but got %d input(s). ", nbInputs)); - PADDLE_ENFORCE_EQ(nbOutputs, getNbOutputs(), + PADDLE_ENFORCE_EQ(nbOutputs, + getNbOutputs(), platform::errors::InvalidArgument("Must have 1 output, " "but got %d output(s). ", nbOutputs)); @@ -591,7 +830,8 @@ bool MatmulPluginDynamic::supportsFormatCombination( } void MatmulPluginDynamic::configurePlugin( - const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, const nvinfer1::DynamicPluginTensorDesc* outputs, int nbOutputs) TRT_NOEXCEPT { float inscale_0 = inputs[0].desc.scale; @@ -632,46 +872,49 @@ void MatmulPluginDynamic::configurePlugin( } PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( reinterpret_cast<void**>(&alpha_scale_), n_max * sizeof(float))); - cudaMemcpyAsync(alpha_scale_, &alpha_tem[0], n_max * sizeof(float), + cudaMemcpyAsync(alpha_scale_, + &alpha_tem[0], + n_max * sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(float))); - cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), - cudaMemcpyHostToDevice); + cudaMemcpyAsync( + alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); float one_tem = 1; PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc(reinterpret_cast<void**>(&alpha_one_), sizeof(float))); - cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float), - cudaMemcpyHostToDevice); + cudaMemcpyAsync( + alpha_one_, &one_tem, sizeof(float), cudaMemcpyHostToDevice); } else if (type_ == nvinfer1::DataType::kHALF) { half alpha_tem = static_cast<half>(alpha_); PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(half))); - cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half), - cudaMemcpyHostToDevice); + cudaMemcpyAsync( + alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice); half zero_tem = static_cast<half>(zero); PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(half))); - cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(half), - cudaMemcpyHostToDevice); + cudaMemcpyAsync( + alpha_zero_, &zero_tem, sizeof(half), cudaMemcpyHostToDevice); } else { float alpha_tem = alpha_; PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(float))); - cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float), - cudaMemcpyHostToDevice); + cudaMemcpyAsync( + alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc(reinterpret_cast<void**>(&alpha_zero_),
sizeof(float))); - cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), - cudaMemcpyHostToDevice); + cudaMemcpyAsync( + alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); } } -void MatmulPluginDynamic::attachToContext( - cudnnContext* cudnnContext, cublasContext* cublasContext, - nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT { +void MatmulPluginDynamic::attachToContext(cudnnContext* cudnnContext, + cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) + TRT_NOEXCEPT { dyl::cublasLtCreate(&cublas_); } @@ -710,7 +953,8 @@ void MatmulPluginDynamic::terminate() TRT_NOEXCEPT { int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, - void* const* outputs, void* workspace, + void* const* outputs, + void* workspace, cudaStream_t stream) TRT_NOEXCEPT { const auto Input0Desc = inputDesc[0]; const auto Input1Desc = inputDesc[1]; @@ -770,65 +1014,89 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, cublasLtOrder_t COL32 = CUBLASLT_ORDER_COL32; cublasLtOrder_t COL4_4R2_8C = CUBLASLT_ORDER_COL4_4R2_8C; - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( - &Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k, - AopTranspose == CUBLAS_OP_N ? k : n, - AopTranspose == CUBLAS_OP_N ? n : k)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutCreate(&Adesc, + cudadataTypeIO, + AopTranspose == CUBLAS_OP_N ? n : k, + AopTranspose == CUBLAS_OP_N ? k : n, + AopTranspose == CUBLAS_OP_N ? n : k)); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Adesc, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), + Adesc, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( - &Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m, - BopTranspose == CUBLAS_OP_N ? m : k, - BopTranspose == CUBLAS_OP_N ? k : m)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutCreate(&Bdesc, + cudadataTypeIO, + BopTranspose == CUBLAS_OP_N ? k : m, + BopTranspose == CUBLAS_OP_N ? m : k, + BopTranspose == CUBLAS_OP_N ? 
k : m)); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Bdesc, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), + Bdesc, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(strideb), sizeof(strideb))); PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Cdesc, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), + Cdesc, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(stridec), sizeof(stridec))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &AtransformDesc, cudadataTypeIO, n, k, ldatransform)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - AtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(AtransformDesc, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &BtransformDesc, cudadataTypeIO, m, k, ldbtransform)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL4_4R2_8C, - sizeof(COL4_4R2_8C))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(BtransformDesc, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(BtransformDesc, + CUBLASLT_MATRIX_LAYOUT_ORDER, + &COL4_4R2_8C, + sizeof(COL4_4R2_8C))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &CtransformDesc, cudadataTypeIO, n, m, ldctransform)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - CtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(CtransformDesc, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); @@ -837,23 +1105,33 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixTransformDescCreate(&transformDescT, cudaDataTypeS)); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( - transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, - &cudaDataTypeS, 
sizeof(cudaDataTypeS))); + transformDescT, + CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, + &cudaDataTypeS, + sizeof(cudaDataTypeS))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( - transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &Transpose, + transformDescT, + CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, + &Transpose, sizeof(Transpose))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( - transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, - &transform_model, sizeof(transform_model))); + transformDescT, + CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, + &transform_model, + sizeof(transform_model))); PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixTransformDescCreate(&transformDescN, cudaDataTypeS)); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( - transformDescN, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, - &cudaDataTypeS, sizeof(cudaDataTypeS))); + transformDescN, + CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, + &cudaDataTypeS, + sizeof(cudaDataTypeS))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( - transformDescN, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, - &transform_model, sizeof(transform_model))); + transformDescN, + CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, + &transform_model, + sizeof(transform_model))); cublasOperation_t ATranspose = CUBLAS_OP_N, BTranspose = CUBLAS_OP_T; cublasLtPointerMode_t matmul_model = @@ -867,24 +1145,48 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, &matmulDesc, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &ATranspose, - sizeof(ATranspose))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BTranspose, - sizeof(BTranspose))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, - sizeof(matmul_model))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc, + CUBLASLT_MATMUL_DESC_TRANSA, + &ATranspose, + sizeof(ATranspose))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc, + CUBLASLT_MATMUL_DESC_TRANSB, + &BTranspose, + sizeof(BTranspose))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc, + CUBLASLT_MATMUL_DESC_POINTER_MODE, + &matmul_model, + sizeof(matmul_model))); const int8_t* B = static_cast<const int8_t*>(inputs[0]); const int8_t* A = static_cast<const int8_t*>(inputs[1]); int8_t* C = static_cast<int8_t*>(outputs[0]); - Ltgemm_int8_linear(cublas_, A, Adesc, Atransform_, AtransformDesc, transA_, - B, Bdesc, Btransform_, BtransformDesc, transB_, C, Cdesc, - Ctransform_, CtransformDesc, transformDescT, - transformDescN, matmulDesc, alpha_scale_, alpha_zero_, - alpha_one_, workspace, stream); + Ltgemm_int8_linear(cublas_, + A, + Adesc, + Atransform_, + AtransformDesc, + transA_, + B, + Bdesc, + Btransform_, + BtransformDesc, + transB_, + C, + Cdesc, + Ctransform_, + CtransformDesc, + transformDescT, + transformDescN, + matmulDesc, + alpha_scale_, + alpha_zero_, + alpha_one_, + workspace, + stream); } else if (type_ == nvinfer1::DataType::kHALF) { cudaDataType_t cudadataTypeIO = CUDA_R_16F; cudaDataType_t cudaDataTypeS = CUDA_R_16F; @@ -893,41 +1195,57 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_16F; #endif -
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( - &Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k, - AopTranspose == CUBLAS_OP_N ? k : n, - AopTranspose == CUBLAS_OP_N ? n : k)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutCreate(&Adesc, + cudadataTypeIO, + AopTranspose == CUBLAS_OP_N ? n : k, + AopTranspose == CUBLAS_OP_N ? k : n, + AopTranspose == CUBLAS_OP_N ? n : k)); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Adesc, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), + Adesc, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( - &Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m, - BopTranspose == CUBLAS_OP_N ? m : k, - BopTranspose == CUBLAS_OP_N ? k : m)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutCreate(&Bdesc, + cudadataTypeIO, + BopTranspose == CUBLAS_OP_N ? k : m, + BopTranspose == CUBLAS_OP_N ? m : k, + BopTranspose == CUBLAS_OP_N ? k : m)); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Bdesc, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), + Bdesc, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(strideb), sizeof(strideb))); PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Cdesc, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), + Cdesc, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; @@ -940,21 +1258,37 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, &matmulDesc, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, - sizeof(AopTranspose))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, - sizeof(BopTranspose))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, 
&matmul_model, - sizeof(matmul_model))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc, + CUBLASLT_MATMUL_DESC_TRANSA, + &AopTranspose, + sizeof(AopTranspose))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc, + CUBLASLT_MATMUL_DESC_TRANSB, + &BopTranspose, + sizeof(BopTranspose))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc, + CUBLASLT_MATMUL_DESC_POINTER_MODE, + &matmul_model, + sizeof(matmul_model))); const half* B = static_cast(inputs[0]); const half* A = static_cast(inputs[1]); half* C = static_cast(outputs[0]); - Ltgemm_fp16_linear(cublas_, A, Adesc, B, Bdesc, C, Cdesc, matmulDesc, - alpha_scale_, alpha_zero_, workspace, stream); + Ltgemm_fp16_linear(cublas_, + A, + Adesc, + B, + Bdesc, + C, + Cdesc, + matmulDesc, + alpha_scale_, + alpha_zero_, + workspace, + stream); } else { cudaDataType_t cudadataTypeIO = CUDA_R_32F; cudaDataType_t cudaDataTypeS = CUDA_R_32F; @@ -963,41 +1297,57 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F_FAST_16F; #endif - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( - &Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k, - AopTranspose == CUBLAS_OP_N ? k : n, - AopTranspose == CUBLAS_OP_N ? n : k)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutCreate(&Adesc, + cudadataTypeIO, + AopTranspose == CUBLAS_OP_N ? n : k, + AopTranspose == CUBLAS_OP_N ? k : n, + AopTranspose == CUBLAS_OP_N ? n : k)); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Adesc, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), + Adesc, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( - &Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m, - BopTranspose == CUBLAS_OP_N ? m : k, - BopTranspose == CUBLAS_OP_N ? k : m)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutCreate(&Bdesc, + cudadataTypeIO, + BopTranspose == CUBLAS_OP_N ? k : m, + BopTranspose == CUBLAS_OP_N ? m : k, + BopTranspose == CUBLAS_OP_N ? 
k : m)); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Bdesc, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), + Bdesc, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(strideb), sizeof(strideb))); PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n)); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, - sizeof(cudadataTypeIO))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatrixLayoutSetAttribute(Cdesc, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &cudadataTypeIO, + sizeof(cudadataTypeIO))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( - Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), + Cdesc, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; @@ -1010,21 +1360,37 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, &matmulDesc, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, - sizeof(AopTranspose))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, - sizeof(BopTranspose))); - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( - matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, - sizeof(matmul_model))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc, + CUBLASLT_MATMUL_DESC_TRANSA, + &AopTranspose, + sizeof(AopTranspose))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc, + CUBLASLT_MATMUL_DESC_TRANSB, + &BopTranspose, + sizeof(BopTranspose))); + PADDLE_ENFORCE_GPU_SUCCESS( + dyl::cublasLtMatmulDescSetAttribute(matmulDesc, + CUBLASLT_MATMUL_DESC_POINTER_MODE, + &matmul_model, + sizeof(matmul_model))); const float* B = static_cast<const float*>(inputs[0]); const float* A = static_cast<const float*>(inputs[1]); float* C = static_cast<float*>(outputs[0]); - Ltgemm_fp32_linear(cublas_, A, Adesc, B, Bdesc, C, Cdesc, matmulDesc, - alpha_scale_, alpha_zero_, workspace, stream); + Ltgemm_fp32_linear(cublas_, + A, + Adesc, + B, + Bdesc, + C, + Cdesc, + matmulDesc, + alpha_scale_, + alpha_zero_, + workspace, + stream); } return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/operators/detection/poly_util.cc b/paddle/fluid/operators/detection/poly_util.cc index 6aa81bf1b39f7a24d2b5da0e056dac33ed387336..1bdba0e5c66c0b5b3363698e638832a0da09009f 100644 --- a/paddle/fluid/operators/detection/poly_util.cc +++ b/paddle/fluid/operators/detection/poly_util.cc @@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ - #ifndef POLY_UTIL_CC_ #define POLY_UTIL_CC_ @@ -26,61 +25,62 @@ using gpc::gpc_free_polygon; using gpc::gpc_polygon_clip; template <class T> -void Array2PointVec(const T*& box, const size_t box_size, - std::vector<Point_<T>>& vec) { +void Array2PointVec(const T* box, + const size_t box_size, + std::vector<Point_<T>>* vec) { size_t pts_num = box_size / 2; - vec.resize(pts_num); + (*vec).resize(pts_num); for (size_t i = 0; i < pts_num; i++) { - vec.at(i).x = box[2 * i]; - vec.at(i).y = box[2 * i + 1]; + (*vec).at(i).x = box[2 * i]; + (*vec).at(i).y = box[2 * i + 1]; } } template <class T> -void Array2Poly(const T*& box, const size_t box_size, gpc::gpc_polygon& poly) { +void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly) { size_t pts_num = box_size / 2; - poly.num_contours = 1; - poly.hole = (int*)malloc(sizeof(int)); - poly.hole[0] = 0; - poly.contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list)); - poly.contour->num_vertices = pts_num; - poly.contour->vertex = + (*poly).num_contours = 1; + (*poly).hole = reinterpret_cast<int*>(malloc(sizeof(int))); + (*poly).hole[0] = 0; + (*poly).contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list)); + (*poly).contour->num_vertices = pts_num; + (*poly).contour->vertex = (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num); for (size_t i = 0; i < pts_num; ++i) { - poly.contour->vertex[i].x = box[2 * i]; - poly.contour->vertex[i].y = box[2 * i + 1]; + (*poly).contour->vertex[i].x = box[2 * i]; + (*poly).contour->vertex[i].y = box[2 * i + 1]; } } template <class T> -void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon& poly) { +void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon* poly) { int pts_num = vec.size(); - poly.num_contours = 1; - poly.hole = (int*)malloc(sizeof(int)); - poly.hole[0] = 0; - poly.contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list)); - poly.contour->num_vertices = pts_num; - poly.contour->vertex = + (*poly).num_contours = 1; + (*poly).hole = reinterpret_cast<int*>(malloc(sizeof(int))); + (*poly).hole[0] = 0; + (*poly).contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list)); + (*poly).contour->num_vertices = pts_num; + (*poly).contour->vertex = (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num); for (size_t i = 0; i < pts_num; ++i) { - poly.contour->vertex[i].x = vec[i].x; - poly.contour->vertex[i].y = vec[i].y; + (*poly).contour->vertex[i].x = vec[i].x; + (*poly).contour->vertex[i].y = vec[i].y; } } template <class T> void Poly2PointVec(const gpc::gpc_vertex_list& contour, - std::vector<Point_<T>>& vec) { + std::vector<Point_<T>>* vec) { int pts_num = contour.num_vertices; - vec.resize(pts_num); + (*vec).resize(pts_num); for (int i = 0; i < pts_num; i++) { - vec.at(i).x = contour.vertex[i].x; - vec.at(i).y = contour.vertex[i].y; + (*vec).at(i).x = contour.vertex[i].x; + (*vec).at(i).y = contour.vertex[i].y; } } template <class T> -T GetContourArea(std::vector<Point_<T>>& vec) { +T GetContourArea(const std::vector<Point_<T>>& vec) { size_t pts_num = vec.size(); if (pts_num < 3) return T(0.); T area = T(0.); @@ -96,17 +96,19 @@ T PolyArea(const T* box, const size_t box_size, const bool normalized) { // If coordinate values are is invalid // if area size <= 0, return 0.
std::vector<Point_<T>> vec; - Array2PointVec(box, box_size, vec); + Array2PointVec(box, box_size, &vec); return GetContourArea(vec); } template <class T> -T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size, +T PolyOverlapArea(const T* box1, + const T* box2, + const size_t box_size, const bool normalized) { gpc::gpc_polygon poly1; gpc::gpc_polygon poly2; - Array2Poly(box1, box_size, poly1); - Array2Poly(box2, box_size, poly2); + Array2Poly(box1, box_size, &poly1); + Array2Poly(box2, box_size, &poly2); gpc::gpc_polygon respoly; gpc::gpc_op op = gpc::GPC_INT; gpc::gpc_polygon_clip(op, &poly2, &poly1, &respoly); @@ -115,7 +117,7 @@ T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size, int contour_num = respoly.num_contours; for (int i = 0; i < contour_num; ++i) { std::vector<Point_<T>> resvec; - Poly2PointVec(respoly.contour[i], resvec); + Poly2PointVec(respoly.contour[i], &resvec); // inter_area += std::fabs(cv::contourArea(resvec)) + 0.5f * // (cv::arcLength(resvec, true)); inter_area += GetContourArea(resvec); diff --git a/paddle/fluid/operators/detection/poly_util.h b/paddle/fluid/operators/detection/poly_util.h index cc37f00008d33b5688a7c7719e810b221ba23d30..a4eb41b691c493cfdb139f2ce60a07e4dadbc275 100644 --- a/paddle/fluid/operators/detection/poly_util.h +++ b/paddle/fluid/operators/detection/poly_util.h @@ -11,9 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#ifndef POLY_UTIL_H_ -#define POLY_UTIL_H_ +#pragma once #include <vector> @@ -44,31 +42,32 @@ class Point_ { }; template <class T> -void Array2PointVec(const T*& box, const size_t box_size, - std::vector<Point_<T>>& vec); +void Array2PointVec(const T* box, + const size_t box_size, + std::vector<Point_<T>>* vec); template <class T> -void Array2Poly(const T*& box, const size_t box_size, gpc::gpc_polygon& poly); +void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly); template <class T> -void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon& poly); +void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon* poly); template <class T> void Poly2PointVec(const gpc::gpc_vertex_list& contour, - std::vector<Point_<T>>& vec); + std::vector<Point_<T>>* vec); template <class T> -T GetContourArea(std::vector<Point_<T>>& vec); +T GetContourArea(const std::vector<Point_<T>>& vec); template <class T> T PolyArea(const T* box, const size_t box_size, const bool normalized); template <class T> -T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size, +T PolyOverlapArea(const T* box1, + const T* box2, + const size_t box_size, const bool normalized); } // namespace operators } // namespace paddle #include "paddle/fluid/operators/detection/poly_util.cc" - -#endif  // POLY_UTIL_H_ diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 424faf30d3a9f279eb99bde0f402665610231068..24e3a5aa69da879cd0eaf732657f25b238589a58 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
index 424faf30d3a9f279eb99bde0f402665610231068..24e3a5aa69da879cd0eaf732657f25b238589a58 100644
--- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h"
 
 namespace {
@@ -49,19 +48,23 @@ static std::vector<int64_t> Transpose(const std::vector<int64_t>& x,
   size_t axis_size = axis.size();
 
   auto axis_set = std::set<int>(axis.begin(), axis.end());
-  PADDLE_ENFORCE_EQ(axis_set.size(), axis_size,
+  PADDLE_ENFORCE_EQ(axis_set.size(),
+                    axis_size,
                     paddle::platform::errors::InvalidArgument(
                         "In an axis array, elements must be unique."));
 
-  PADDLE_ENFORCE_EQ(in_rank, axis_size,
+  PADDLE_ENFORCE_EQ(in_rank,
+                    axis_size,
                     paddle::platform::errors::InvalidArgument(
                         "The input dimension's size "
                         "should be equal to the axis's size. "
                         "But received dimension is %d, "
                         "axis's size is %d",
-                        in_rank, axis_size));
+                        in_rank,
+                        axis_size));
 
-  PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), axis_size,
+  PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()),
+                    axis_size,
                     paddle::platform::errors::InvalidArgument(
                         "Axis values must be ranging from 0 to (dims - 1)."));
 
@@ -85,7 +88,8 @@ std::vector<int64_t> GetInputStrides(const ExecutionContext& ctx,
   auto& MatrixDimsFromVector =
       input_name == "X" ? RowMatrixDimsFromVector : ColumnMatrixDimsFromVector;
   phi::funcs::MatDescriptor mat_dim = phi::funcs::CreateMatrixDescriptor(
-      MatrixDimsFromVector(new_dims), 0,
+      MatrixDimsFromVector(new_dims),
+      0,
       ctx.Attr<bool>(std::string("trans_") +
                      static_cast<char>(std::tolower(input_name[0]))));
 
@@ -125,16 +129,27 @@ template <typename T>
 void ExecuteMatMulV2(const ExecutionContext& ctx,
                      const MKLDNNDeviceContext& dev_ctx,
                      const dnnl::engine onednn_engine,
-                     paddle::platform::Place cpu_place, const Tensor* x,
-                     std::vector<int64_t>& x_dims, bool trans_x,
-                     const Tensor* y, std::vector<int64_t>& y_dims,
-                     bool trans_y, Tensor* out, std::vector<int64_t>& out_dims,
+                     paddle::platform::Place cpu_place,
+                     const Tensor* x,
+                     const std::vector<int64_t>& x_dims,
+                     bool trans_x,
+                     const Tensor* y,
+                     const std::vector<int64_t>& y_dims,
+                     bool trans_y,
+                     Tensor* out,
+                     const std::vector<int64_t>& out_dims,
                      int execution_number = 0) {
   std::vector<int64_t> x_strides_override = GetInputStrides(ctx, "X");
   std::vector<int64_t> y_strides_override = GetInputStrides(ctx, "Y");
-  MatMulV2MKLDNNHandler<T> handler(onednn_engine, ctx.GetPlace(), x_dims,
-                                   trans_x, y_dims, trans_y, IsOutputFused(ctx),
-                                   x_strides_override, y_strides_override);
+  MatMulV2MKLDNNHandler<T> handler(onednn_engine,
+                                   ctx.GetPlace(),
+                                   x_dims,
+                                   trans_x,
+                                   y_dims,
+                                   trans_y,
+                                   IsOutputFused(ctx),
+                                   x_strides_override,
+                                   y_strides_override);
 
   const auto src_memory_p = handler.AcquireSrcMemory(x);
   const auto weights_memory_p = handler.AcquireWeightsMemory(y);
@@ -177,44 +192,48 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel<T> {
   void CalculateMatrixDims(const ExecutionContext& ctx,
                            const std::vector<int64_t>& x_dims,
                            const std::vector<int64_t>& y_dims,
-                           std::vector<int64_t>& x_bd_dims,
-                           std::vector<int64_t>& y_bd_dims,
-                           std::vector<int64_t>& out_dims, Tensor* out) const {
+                           std::vector<int64_t>* x_bd_dims,
+                           std::vector<int64_t>* y_bd_dims,
+                           std::vector<int64_t>* out_dims,
+                           Tensor* out) const {
     if (x_dims.size() == 1) {
-      x_bd_dims[x_bd_dims.size() - 1] = x_dims[0];
+      (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[0];
     } else if (x_dims.size() == 2) {
-      x_bd_dims[x_bd_dims.size() - 1] = x_dims[1];
-      x_bd_dims[x_bd_dims.size() - 2] = x_dims[0];
+      (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[1];
+      (*x_bd_dims)[(*x_bd_dims).size() - 2] = x_dims[0];
     } else {
       for (size_t i = 0; i < x_dims.size(); ++i) {
-        x_bd_dims[x_bd_dims.size() - x_dims.size() + i] = x_dims[i];
+        (*x_bd_dims)[(*x_bd_dims).size() - x_dims.size() + i] = x_dims[i];
       }
     }
     if (y_dims.size() == 1) {
-      y_bd_dims[x_bd_dims.size() - 2] = y_dims[0];
+      (*y_bd_dims)[(*x_bd_dims).size() - 2] = y_dims[0];
     } else if (y_dims.size() == 2) {
-      y_bd_dims[y_bd_dims.size() - 1] = y_dims[1];
-      y_bd_dims[y_bd_dims.size() - 2] = y_dims[0];
+      (*y_bd_dims)[(*y_bd_dims).size() - 1] = y_dims[1];
+      (*y_bd_dims)[(*y_bd_dims).size() - 2] = y_dims[0];
     } else {
       for (size_t i = 0; i < y_dims.size(); ++i) {
-        y_bd_dims[y_bd_dims.size() - y_dims.size() + i] = y_dims[i];
+        (*y_bd_dims)[(*y_bd_dims).size() - y_dims.size() + i] = y_dims[i];
       }
     }
 
     if (!IsOutputFused(ctx) && x_dims.size() > 2 && y_dims.size() > 2) {
-      for (size_t i = 0; i < x_bd_dims.size() - 2; ++i) {
+      for (size_t i = 0; i < (*x_bd_dims).size() - 2; ++i) {
         PADDLE_ENFORCE_EQ(
-            x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] == 1 ||
-                y_bd_dims[i] == 1,
+            (*x_bd_dims)[i] == (*y_bd_dims)[i] || (*x_bd_dims)[i] == 1 ||
+                (*y_bd_dims)[i] == 1,
             true,
             paddle::platform::errors::InvalidArgument(
                 "Tensor dimensions are incorrect for broadcasting."
                 "Dimensions in X and Y must be same or equal to 1, but "
                 "received x_dim[%d]=%d and y_dims[%d]= %d",
-                i, x_bd_dims[i], i, y_bd_dims[i]));
-        out_dims[i] = std::max(x_bd_dims[i], y_bd_dims[i]);
+                i,
+                (*x_bd_dims)[i],
+                i,
+                (*y_bd_dims)[i]));
+        (*out_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]);
       }
-      out->Resize(phi::make_ddim(out_dims));
+      out->Resize(phi::make_ddim((*out_dims)));
     }
   }
 
@@ -238,11 +257,20 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel<T> {
     std::vector<int64_t> x_bd_dims(ndims, 1);
     std::vector<int64_t> y_bd_dims(ndims, 1);
 
-    CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, out_dims,
-                        out);
-
-    ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x,
-                       x_bd_dims, trans_x, y, y_bd_dims, trans_y, out,
+    CalculateMatrixDims(
+        ctx, x_dims, y_dims, &x_bd_dims, &y_bd_dims, &out_dims, out);
+
+    ExecuteMatMulV2<T>(ctx,
+                       dev_ctx,
+                       onednn_engine,
+                       ctx.GetPlace(),
+                       x,
+                       x_bd_dims,
+                       trans_x,
+                       y,
+                       y_bd_dims,
+                       trans_y,
+                       out,
                        out_dims);
   }
 };
@@ -253,36 +281,46 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel<T> {
   void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); }
 
  private:
-  void CalculateGradMatrixDims(const ExecutionContext& ctx, Tensor* dx_tmp,
+  void CalculateGradMatrixDims(const ExecutionContext& ctx,
+                               Tensor* dx_tmp,
                                Tensor* dy_tmp,
                                const std::vector<int64_t>& dx_dims,
                                const std::vector<int64_t>& dy_dims,
-                               std::vector<int64_t>& dx_bd_dims,
-                               std::vector<int64_t>& dy_bd_dims) const {
+                               std::vector<int64_t>* dx_bd_dims,
+                               std::vector<int64_t>* dy_bd_dims) const {
     for (size_t i = 0; i < dx_dims.size() - 2; ++i) {
       if (dx_dims[i] != dy_dims[i]) {
        if (dx_dims[i] == 1) {
-          dx_bd_dims[i] = dy_dims[i];
+          (*dx_bd_dims)[i] = dy_dims[i];
        } else {
-          dy_bd_dims[i] = dx_dims[i];
+          (*dy_bd_dims)[i] = dx_dims[i];
        }
      }
    }
 
-    dx_tmp->Resize(phi::make_ddim(dx_bd_dims));
+    dx_tmp->Resize(phi::make_ddim((*dx_bd_dims)));
     dx_tmp->mutable_data<T>(ctx.GetPlace());
-    dy_tmp->Resize(phi::make_ddim(dy_bd_dims));
+    dy_tmp->Resize(phi::make_ddim((*dy_bd_dims)));
     dy_tmp->mutable_data<T>(ctx.GetPlace());
   }
 
   void ReduceSumForMatmulGradOutput(
-      const ExecutionContext& ctx, const MKLDNNDeviceContext& dev_ctx,
-      const dnnl::engine onednn_engine, const Tensor* dx_tmp, Tensor* dx,
-      std::vector<int64_t>& dx_dims,
+      const ExecutionContext& ctx,
+      const MKLDNNDeviceContext& dev_ctx,
+      const dnnl::engine onednn_engine,
+      const Tensor* dx_tmp,
+      Tensor* dx,
+      const std::vector<int64_t>& dx_dims,
      const std::vector<int64_t>& squeezed_dims) const {
     paddle::platform::ReductionMKLDNNHandler<T> handler(
-        dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine,
-        ctx.GetPlace(), dx_tmp, dx, dx_dims);
+        dnnl::algorithm::reduction_sum,
+        0.0f,
+        0.0f,
+        onednn_engine,
+        ctx.GetPlace(),
+        dx_tmp,
+        dx,
+        dx_dims);
 
     auto src_memory_p = handler.AcquireSrcMemory(dx_tmp);
     auto dst_memory_p = handler.AcquireDstMemory(dx);
@@ -326,9 +364,9 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel<T> {
     } else if (x_dims.size() != y_dims.size()) {
       is_broadcast = true;
     } else {
-      is_broadcast =
-          !std::equal(x_dims.cbegin(), x_dims.cbegin() + x_dims.size() - 2,
-                      y_dims.cbegin());
+      is_broadcast = !std::equal(x_dims.cbegin(),
+                                 x_dims.cbegin() + x_dims.size() - 2,
+                                 y_dims.cbegin());
     }
 
     // if no broadcasting is needed, we can simply use matmul's grad and avoid
@@ -362,44 +400,138 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel<T> {
     std::vector<int64_t> dx_bd_dims(x_dims);
     std::vector<int64_t> dy_bd_dims(y_dims);
 
-    CalculateGradMatrixDims(ctx, &dx_tmp, &dy_tmp, x_dims, y_dims, dx_bd_dims,
-                            dy_bd_dims);
+    CalculateGradMatrixDims(
+        ctx, &dx_tmp, &dy_tmp, x_dims, y_dims, &dx_bd_dims, &dy_bd_dims);
 
     if (trans_x && trans_y) {
-      ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, y_dims,
-                         true, dout, dout_dims, true, &dx_tmp, dx_bd_dims, 1);
-      ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout,
-                         dout_dims, true, x, x_dims, true, &dy_tmp, dy_bd_dims,
+      ExecuteMatMulV2<T>(ctx,
+                         dev_ctx,
+                         onednn_engine,
+                         ctx.GetPlace(),
+                         y,
+                         y_dims,
+                         true,
+                         dout,
+                         dout_dims,
+                         true,
+                         &dx_tmp,
+                         dx_bd_dims,
+                         1);
+      ExecuteMatMulV2<T>(ctx,
+                         dev_ctx,
+                         onednn_engine,
+                         ctx.GetPlace(),
+                         dout,
+                         dout_dims,
+                         true,
+                         x,
+                         x_dims,
+                         true,
+                         &dy_tmp,
+                         dy_bd_dims,
                          2);
     } else if (trans_x) {
-      ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, y_dims,
-                         false, dout, dout_dims, true, &dx_tmp, dx_bd_dims, 1);
-      ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, x_dims,
-                         false, dout, dout_dims, false, &dy_tmp, dy_bd_dims, 2);
+      ExecuteMatMulV2<T>(ctx,
+                         dev_ctx,
+                         onednn_engine,
+                         ctx.GetPlace(),
+                         y,
+                         y_dims,
+                         false,
+                         dout,
+                         dout_dims,
+                         true,
+                         &dx_tmp,
+                         dx_bd_dims,
+                         1);
+      ExecuteMatMulV2<T>(ctx,
+                         dev_ctx,
+                         onednn_engine,
+                         ctx.GetPlace(),
+                         x,
+                         x_dims,
+                         false,
+                         dout,
+                         dout_dims,
+                         false,
+                         &dy_tmp,
+                         dy_bd_dims,
+                         2);
     } else if (trans_y) {
-      ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout,
-                         dout_dims, false, y, y_dims, false, &dx_tmp,
-                         dx_bd_dims, 1);
-      ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout,
-                         dout_dims, true, x, x_dims, false, &dy_tmp, dy_bd_dims,
+      ExecuteMatMulV2<T>(ctx,
+                         dev_ctx,
+                         onednn_engine,
+                         ctx.GetPlace(),
+                         dout,
+                         dout_dims,
+                         false,
+                         y,
+                         y_dims,
+                         false,
+                         &dx_tmp,
+                         dx_bd_dims,
+                         1);
+      ExecuteMatMulV2<T>(ctx,
+                         dev_ctx,
+                         onednn_engine,
+                         ctx.GetPlace(),
+                         dout,
+                         dout_dims,
+                         true,
+                         x,
+                         x_dims,
+                         false,
+                         &dy_tmp,
+                         dy_bd_dims,
                          2);
     } else {
-      ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout,
-                         dout_dims, false, y, y_dims, true, &dx_tmp, dx_bd_dims,
+      ExecuteMatMulV2<T>(ctx,
+                         dev_ctx,
+                         onednn_engine,
+                         ctx.GetPlace(),
+                         dout,
+                         dout_dims,
+                         false,
+                         y,
+                         y_dims,
+                         true,
+                         &dx_tmp,
+                         dx_bd_dims,
                          1);
-      ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, x_dims,
-                         true, dout, dout_dims, false, &dy_tmp, dy_bd_dims, 2);
+      ExecuteMatMulV2<T>(ctx,
+                         dev_ctx,
+                         onednn_engine,
+                         ctx.GetPlace(),
+                         x,
+                         x_dims,
+                         true,
+                         dout,
+                         dout_dims,
+                         false,
+                         &dy_tmp,
+                         dy_bd_dims,
+                         2);
     }
 
     if (x_dims != dx_bd_dims) {
-      ReduceSumForMatmulGradOutput(ctx, dev_ctx, onednn_engine, &dx_tmp, dx,
-                                   x_dims, phi::vectorize(x->dims()));
+      ReduceSumForMatmulGradOutput(ctx,
+                                   dev_ctx,
+                                   onednn_engine,
+                                   &dx_tmp,
+                                   dx,
+                                   x_dims,
+                                   phi::vectorize(x->dims()));
     } else {
       *dx = std::move(dx_tmp);
     }
     if (y_dims != dy_bd_dims) {
-      ReduceSumForMatmulGradOutput(ctx, dev_ctx, onednn_engine, &dy_tmp, dy,
-                                   y_dims, phi::vectorize(y->dims()));
+      ReduceSumForMatmulGradOutput(ctx,
+                                   dev_ctx,
+                                   onednn_engine,
+                                   &dy_tmp,
+                                   dy,
+                                   y_dims,
+                                   phi::vectorize(y->dims()));
     } else {
       *dy = std::move(dy_tmp);
    }
@@ -413,10 +545,14 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel<T> {
 };
 }  // anonymous namespace
 
-REGISTER_OP_KERNEL(matmul_v2, MKLDNN, ::paddle::platform::CPUPlace,
+REGISTER_OP_KERNEL(matmul_v2,
+                   MKLDNN,
+                   ::paddle::platform::CPUPlace,
                    MatMulV2MKLDNNKernel<float>,
                    MatMulV2MKLDNNKernel<paddle::platform::bfloat16>);
 
-REGISTER_OP_KERNEL(matmul_v2_grad, MKLDNN, ::paddle::platform::CPUPlace,
+REGISTER_OP_KERNEL(matmul_v2_grad,
+                   MKLDNN,
+                   ::paddle::platform::CPUPlace,
                    MatMulV2GradMKLDNNKernel<float>,
                    MatMulV2GradMKLDNNKernel<paddle::platform::bfloat16>);
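The matmul_v2 rewrite keeps the same broadcast bookkeeping as before, only routed through pointers instead of references. The rule that CalculateMatrixDims enforces, and the reason the grad kernel needs ReduceSumForMatmulGradOutput, can be seen in this standalone sketch (BroadcastBatchDims is a hypothetical name, not a Paddle API):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Batch dims (all but the trailing two) must match or be 1; the output
    // takes the elementwise max, mirroring the PADDLE_ENFORCE_EQ / std::max
    // pair in CalculateMatrixDims. Assumes x_bd and y_bd already share rank.
    std::vector<int64_t> BroadcastBatchDims(const std::vector<int64_t>& x_bd,
                                            const std::vector<int64_t>& y_bd) {
      std::vector<int64_t> out(x_bd.size(), 1);
      for (size_t i = 0; i + 2 < x_bd.size(); ++i) {
        out[i] = std::max(x_bd[i], y_bd[i]);
      }
      // The trailing two dims come from the matmul itself: (M, K) x (K, N).
      out[out.size() - 2] = x_bd[x_bd.size() - 2];
      out[out.size() - 1] = y_bd[y_bd.size() - 1];
      return out;
    }

For example, x_bd = {8, 1, M, K} with y_bd = {1, 4, K, N} yields {8, 4, M, N}. Because x was broadcast over the second batch dim, its gradient buffer dx_tmp is larger than x itself, so the grad kernel runs a oneDNN reduction_sum over the broadcast dims (ReduceSumForMatmulGradOutput) whenever x_dims != dx_bd_dims, and simply moves the buffer into dx otherwise.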