未验证 提交 5a55f13b 编写于 作者: W wangzhen38 提交者: GitHub

【code format】Fix cpplint style 4 (#43695)

* cpplint fix 2

* cpplint fix 2

* fix cpplint style 4

* fix cpplint style 4

* fix cpplint style 4

* fix cpplint style 4
上级 75080988
......@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h"
namespace plf = paddle::platform;
......@@ -21,73 +20,174 @@ namespace inference {
namespace tensorrt {
namespace plugin {
float zero = 0;
void Ltgemm_int8_linear(
cublasLtHandle_t ltHandle, const int8_t* A, cublasLtMatrixLayout_t Adesc,
int8_t* Atransform, cublasLtMatrixLayout_t AtransformDesc, bool transA_,
const int8_t* B, cublasLtMatrixLayout_t Bdesc, int8_t* Btransform,
cublasLtMatrixLayout_t BtransformDesc, bool transB_, int8_t* C,
cublasLtMatrixLayout_t Cdesc, int8_t* Ctransform,
void Ltgemm_int8_linear(cublasLtHandle_t ltHandle,
const int8_t* A,
cublasLtMatrixLayout_t Adesc,
int8_t* Atransform,
cublasLtMatrixLayout_t AtransformDesc,
bool transA_,
const int8_t* B,
cublasLtMatrixLayout_t Bdesc,
int8_t* Btransform,
cublasLtMatrixLayout_t BtransformDesc,
bool transB_,
int8_t* C,
cublasLtMatrixLayout_t Cdesc,
int8_t* Ctransform,
cublasLtMatrixLayout_t CtransformDesc,
cublasLtMatrixTransformDesc_t transformDescT,
cublasLtMatrixTransformDesc_t transformDescN,
cublasLtMatmulDesc_t matmulDesc, void* alpha_scale, void* alpha_zero,
void* alpha_one, void* workspace, cudaStream_t stream) {
cublasLtMatmulDesc_t matmulDesc,
void* alpha_scale,
void* alpha_zero,
void* alpha_one,
void* workspace,
cudaStream_t stream) {
if (transA_) {
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(
ltHandle, transformDescT, alpha_one, A, Adesc, alpha_zero, nullptr,
nullptr, Atransform, AtransformDesc, stream));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(ltHandle,
transformDescT,
alpha_one,
A,
Adesc,
alpha_zero,
nullptr,
nullptr,
Atransform,
AtransformDesc,
stream));
} else {
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(
ltHandle, transformDescN, alpha_one, A, Adesc, alpha_zero, nullptr,
nullptr, Atransform, AtransformDesc, stream));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(ltHandle,
transformDescN,
alpha_one,
A,
Adesc,
alpha_zero,
nullptr,
nullptr,
Atransform,
AtransformDesc,
stream));
}
if (transB_) {
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(
ltHandle, transformDescN, alpha_one, B, Bdesc, alpha_zero, nullptr,
nullptr, Btransform, BtransformDesc, stream));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(ltHandle,
transformDescN,
alpha_one,
B,
Bdesc,
alpha_zero,
nullptr,
nullptr,
Btransform,
BtransformDesc,
stream));
} else {
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(
ltHandle, transformDescT, alpha_one, B, Bdesc, alpha_zero, nullptr,
nullptr, Btransform, BtransformDesc, stream));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(ltHandle,
transformDescT,
alpha_one,
B,
Bdesc,
alpha_zero,
nullptr,
nullptr,
Btransform,
BtransformDesc,
stream));
}
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul(
ltHandle, matmulDesc, alpha_scale, Atransform, AtransformDesc, Btransform,
BtransformDesc, nullptr, Ctransform, CtransformDesc, Ctransform,
CtransformDesc, nullptr, workspace, 0, stream));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(
ltHandle, transformDescN, alpha_one, Ctransform, CtransformDesc,
alpha_zero, nullptr, nullptr, C, Cdesc, stream));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul(ltHandle,
matmulDesc,
alpha_scale,
Atransform,
AtransformDesc,
Btransform,
BtransformDesc,
nullptr,
Ctransform,
CtransformDesc,
Ctransform,
CtransformDesc,
nullptr,
workspace,
0,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform(ltHandle,
transformDescN,
alpha_one,
Ctransform,
CtransformDesc,
alpha_zero,
nullptr,
nullptr,
C,
Cdesc,
stream));
}
void Ltgemm_fp32_linear(cublasLtHandle_t ltHandle, const float* A,
cublasLtMatrixLayout_t Adesc, const float* B,
cublasLtMatrixLayout_t Bdesc, float* C,
void Ltgemm_fp32_linear(cublasLtHandle_t ltHandle,
const float* A,
cublasLtMatrixLayout_t Adesc,
const float* B,
cublasLtMatrixLayout_t Bdesc,
float* C,
cublasLtMatrixLayout_t Cdesc,
cublasLtMatmulDesc_t matmulDesc, void* alpha_scale,
void* alpha_zero, void* workspace,
cublasLtMatmulDesc_t matmulDesc,
void* alpha_scale,
void* alpha_zero,
void* workspace,
cudaStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul(
ltHandle, matmulDesc, alpha_scale, A, Adesc, B, Bdesc, alpha_zero, C,
Cdesc, C, Cdesc, nullptr, workspace, 0, stream));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul(ltHandle,
matmulDesc,
alpha_scale,
A,
Adesc,
B,
Bdesc,
alpha_zero,
C,
Cdesc,
C,
Cdesc,
nullptr,
workspace,
0,
stream));
}
void Ltgemm_fp16_linear(cublasLtHandle_t ltHandle, const half* A,
cublasLtMatrixLayout_t Adesc, const half* B,
cublasLtMatrixLayout_t Bdesc, half* C,
void Ltgemm_fp16_linear(cublasLtHandle_t ltHandle,
const half* A,
cublasLtMatrixLayout_t Adesc,
const half* B,
cublasLtMatrixLayout_t Bdesc,
half* C,
cublasLtMatrixLayout_t Cdesc,
cublasLtMatmulDesc_t matmulDesc, void* alpha_scale,
void* alpha_zero, void* workspace,
cublasLtMatmulDesc_t matmulDesc,
void* alpha_scale,
void* alpha_zero,
void* workspace,
cudaStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul(
ltHandle, matmulDesc, alpha_scale, A, Adesc, B, Bdesc, alpha_zero, C,
Cdesc, C, Cdesc, nullptr, workspace, 0, stream));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul(ltHandle,
matmulDesc,
alpha_scale,
A,
Adesc,
B,
Bdesc,
alpha_zero,
C,
Cdesc,
C,
Cdesc,
nullptr,
workspace,
0,
stream));
}
nvinfer1::DataType MatmulPlugin::getOutputDataType(
int index, const nvinfer1::DataType* input_types,
int index,
const nvinfer1::DataType* input_types,
int nb_inputs) const TRT_NOEXCEPT {
return input_types[0];
}
......@@ -123,13 +223,17 @@ nvinfer1::Dims MatmulPlugin::getOutputDimensions(
}
bool MatmulPlugin::supportsFormatCombination(
int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs,
int32_t pos,
nvinfer1::PluginTensorDesc const* inOut,
int32_t nbInputs,
int32_t nbOutputs) const TRT_NOEXCEPT {
PADDLE_ENFORCE_EQ(nbInputs, 2,
PADDLE_ENFORCE_EQ(nbInputs,
2,
platform::errors::InvalidArgument("Must have 2 inputs, "
"but got %d input(s). ",
nbInputs));
PADDLE_ENFORCE_EQ(nbOutputs, getNbOutputs(),
PADDLE_ENFORCE_EQ(nbOutputs,
getNbOutputs(),
platform::errors::InvalidArgument("Must have 1 output, "
"but got %d output(s). ",
nbOutputs));
......@@ -192,64 +296,88 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
reinterpret_cast<void**>(&Ctransform_),
sizeof(int8_t) * ((m_ + 32 - 1) / 32 * 32) / 32 * ldctransform));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Adesc_,
cudadataTypeIO,
AopTranspose == CUBLAS_OP_N ? n_ : k_,
AopTranspose == CUBLAS_OP_N ? k_ : n_,
AopTranspose == CUBLAS_OP_N ? n_ : k_));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Adesc_,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea),
Adesc_,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(stridea),
sizeof(stridea)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Bdesc_,
cudadataTypeIO,
BopTranspose == CUBLAS_OP_N ? k_ : m_,
BopTranspose == CUBLAS_OP_N ? m_ : k_,
BopTranspose == CUBLAS_OP_N ? k_ : m_));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Bdesc_,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb),
Bdesc_,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(strideb),
sizeof(strideb)));
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Cdesc_,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec),
Cdesc_,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(stridec),
sizeof(stridec)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&AtransformDesc_, cudadataTypeIO, n_, k_, ldatransform));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
AtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(AtransformDesc_,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
AtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&BtransformDesc_, cudadataTypeIO, m_, k_, ldbtransform));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
BtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(BtransformDesc_,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
BtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL4_4R2_8C,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(BtransformDesc_,
CUBLASLT_MATRIX_LAYOUT_ORDER,
&COL4_4R2_8C,
sizeof(COL4_4R2_8C)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&CtransformDesc_, cudadataTypeIO, n_, m_, ldctransform));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
CtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(CtransformDesc_,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
CtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32)));
......@@ -259,23 +387,33 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescCreate(
&transformDescT_, cudaDataTypeS));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
&cudaDataTypeS, sizeof(cudaDataTypeS)));
transformDescT_,
CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
&cudaDataTypeS,
sizeof(cudaDataTypeS)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &Transpose,
transformDescT_,
CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA,
&Transpose,
sizeof(Transpose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
&transform_model, sizeof(transform_model)));
transformDescT_,
CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
&transform_model,
sizeof(transform_model)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescCreate(
&transformDescN_, cudaDataTypeS));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescN_, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
&cudaDataTypeS, sizeof(cudaDataTypeS)));
transformDescN_,
CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
&cudaDataTypeS,
sizeof(cudaDataTypeS)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescN_, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
&transform_model, sizeof(transform_model)));
transformDescN_,
CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
&transform_model,
sizeof(transform_model)));
cublasOperation_t ATranspose = CUBLAS_OP_N, BTranspose = CUBLAS_OP_T;
cublasLtPointerMode_t matmul_model =
......@@ -289,14 +427,20 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
&matmulDesc_, cudaComputeType, cudaDataTypeS));
#endif
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &ATranspose,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
CUBLASLT_MATMUL_DESC_TRANSA,
&ATranspose,
sizeof(ATranspose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BTranspose,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
CUBLASLT_MATMUL_DESC_TRANSB,
&BTranspose,
sizeof(BTranspose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
CUBLASLT_MATMUL_DESC_POINTER_MODE,
&matmul_model,
sizeof(matmul_model)));
std::vector<float> alpha_tem(n_, 0);
......@@ -305,18 +449,20 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
}
PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc(
reinterpret_cast<void**>(&alpha_scale_), n_ * sizeof(float)));
cudaMemcpyAsync(alpha_scale_, &alpha_tem[0], n_ * sizeof(float),
cudaMemcpyAsync(alpha_scale_,
&alpha_tem[0],
n_ * sizeof(float),
cudaMemcpyHostToDevice);
float zero_tem = zero;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(float)));
cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpyAsync(
alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice);
float one_tem = 1;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMalloc(reinterpret_cast<void**>(&alpha_one_), sizeof(float)));
cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpyAsync(
alpha_one_, &one_tem, sizeof(float), cudaMemcpyHostToDevice);
} else if (type_ == nvinfer1::DataType::kHALF) {
cudaDataType_t cudadataTypeIO = CUDA_R_16F;
cudaDataType_t cudaDataTypeS = CUDA_R_16F;
......@@ -325,41 +471,57 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
#else
cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_16F;
#endif
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Adesc_,
cudadataTypeIO,
AopTranspose == CUBLAS_OP_N ? n_ : k_,
AopTranspose == CUBLAS_OP_N ? k_ : n_,
AopTranspose == CUBLAS_OP_N ? n_ : k_));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Adesc_,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea),
Adesc_,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(stridea),
sizeof(stridea)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Bdesc_,
cudadataTypeIO,
BopTranspose == CUBLAS_OP_N ? k_ : m_,
BopTranspose == CUBLAS_OP_N ? m_ : k_,
BopTranspose == CUBLAS_OP_N ? k_ : m_));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Bdesc_,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb),
Bdesc_,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(strideb),
sizeof(strideb)));
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Cdesc_,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec),
Cdesc_,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(stridec),
sizeof(stridec)));
cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE;
......@@ -372,26 +534,32 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
&matmulDesc_, cudaComputeType, cudaDataTypeS));
#endif
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
CUBLASLT_MATMUL_DESC_TRANSA,
&AopTranspose,
sizeof(AopTranspose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
CUBLASLT_MATMUL_DESC_TRANSB,
&BopTranspose,
sizeof(BopTranspose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
CUBLASLT_MATMUL_DESC_POINTER_MODE,
&matmul_model,
sizeof(matmul_model)));
half alpha_tem = static_cast<half>(alpha_);
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(half)));
cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half),
cudaMemcpyHostToDevice);
cudaMemcpyAsync(
alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice);
half zero_tem = static_cast<half>(zero);
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(half)));
cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(half),
cudaMemcpyHostToDevice);
cudaMemcpyAsync(
alpha_zero_, &zero_tem, sizeof(half), cudaMemcpyHostToDevice);
} else {
cudaDataType_t cudadataTypeIO = CUDA_R_32F;
cudaDataType_t cudaDataTypeS = CUDA_R_32F;
......@@ -400,41 +568,57 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
#else
cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F_FAST_16F;
#endif
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Adesc_,
cudadataTypeIO,
AopTranspose == CUBLAS_OP_N ? n_ : k_,
AopTranspose == CUBLAS_OP_N ? k_ : n_,
AopTranspose == CUBLAS_OP_N ? n_ : k_));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Adesc_,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea),
Adesc_,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(stridea),
sizeof(stridea)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Bdesc_,
cudadataTypeIO,
BopTranspose == CUBLAS_OP_N ? k_ : m_,
BopTranspose == CUBLAS_OP_N ? m_ : k_,
BopTranspose == CUBLAS_OP_N ? k_ : m_));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Bdesc_,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb),
Bdesc_,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(strideb),
sizeof(strideb)));
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Cdesc_,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec),
Cdesc_,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(stridec),
sizeof(stridec)));
cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE;
......@@ -447,32 +631,39 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
&matmulDesc_, cudaComputeType, cudaDataTypeS));
#endif
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
CUBLASLT_MATMUL_DESC_TRANSA,
&AopTranspose,
sizeof(AopTranspose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
CUBLASLT_MATMUL_DESC_TRANSB,
&BopTranspose,
sizeof(BopTranspose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc_,
CUBLASLT_MATMUL_DESC_POINTER_MODE,
&matmul_model,
sizeof(matmul_model)));
float alpha_tem = alpha_;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(float)));
cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpyAsync(
alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice);
float zero_tem = zero;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(float)));
cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpyAsync(
alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice);
}
}
void MatmulPlugin::attachToContext(
cudnnContext* cudnnContext, cublasContext* cublasContext,
nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {
void MatmulPlugin::attachToContext(cudnnContext* cudnnContext,
cublasContext* cublasContext,
nvinfer1::IGpuAllocator* gpuAllocator)
TRT_NOEXCEPT {
dyl::cublasLtCreate(&cublas_);
}
......@@ -509,35 +700,76 @@ void MatmulPlugin::terminate() TRT_NOEXCEPT {
} */
}
int MatmulPlugin::enqueue(int batchSize, const void* const* inputs,
int MatmulPlugin::enqueue(int batchSize,
const void* const* inputs,
#if IS_TRT_VERSION_LT(8000)
void** outputs, void* workspace,
void** outputs,
void* workspace,
cudaStream_t stream) {
#else
void* const* outputs, void* workspace,
void* const* outputs,
void* workspace,
cudaStream_t stream) TRT_NOEXCEPT {
#endif
if (type_ == nvinfer1::DataType::kINT8) {
const int8_t* B = static_cast<const int8_t*>(inputs[0]);
const int8_t* A = static_cast<const int8_t*>(inputs[1]);
int8_t* C = static_cast<int8_t*>(outputs[0]);
Ltgemm_int8_linear(
cublas_, A, Adesc_, Atransform_, AtransformDesc_, transA_, B, Bdesc_,
Btransform_, BtransformDesc_, transB_, C, Cdesc_, Ctransform_,
CtransformDesc_, transformDescT_, transformDescN_, matmulDesc_,
alpha_scale_, alpha_zero_, alpha_one_, workspace, stream);
Ltgemm_int8_linear(cublas_,
A,
Adesc_,
Atransform_,
AtransformDesc_,
transA_,
B,
Bdesc_,
Btransform_,
BtransformDesc_,
transB_,
C,
Cdesc_,
Ctransform_,
CtransformDesc_,
transformDescT_,
transformDescN_,
matmulDesc_,
alpha_scale_,
alpha_zero_,
alpha_one_,
workspace,
stream);
} else if (type_ == nvinfer1::DataType::kFLOAT) {
const float* B = static_cast<const float*>(inputs[0]);
const float* A = static_cast<const float*>(inputs[1]);
float* C = static_cast<float*>(outputs[0]);
Ltgemm_fp32_linear(cublas_, A, Adesc_, B, Bdesc_, C, Cdesc_, matmulDesc_,
alpha_scale_, alpha_zero_, workspace, stream);
Ltgemm_fp32_linear(cublas_,
A,
Adesc_,
B,
Bdesc_,
C,
Cdesc_,
matmulDesc_,
alpha_scale_,
alpha_zero_,
workspace,
stream);
} else if (type_ == nvinfer1::DataType::kHALF) {
const half* B = static_cast<const half*>(inputs[0]);
const half* A = static_cast<const half*>(inputs[1]);
half* C = static_cast<half*>(outputs[0]);
Ltgemm_fp16_linear(cublas_, A, Adesc_, B, Bdesc_, C, Cdesc_, matmulDesc_,
alpha_scale_, alpha_zero_, workspace, stream);
Ltgemm_fp16_linear(cublas_,
A,
Adesc_,
B,
Bdesc_,
C,
Cdesc_,
matmulDesc_,
alpha_scale_,
alpha_zero_,
workspace,
stream);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"VarMessageToVarType:Unsupported type"));
......@@ -546,13 +778,16 @@ int MatmulPlugin::enqueue(int batchSize, const void* const* inputs,
}
nvinfer1::DataType MatmulPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType* input_types,
int index,
const nvinfer1::DataType* input_types,
int nb_inputs) const TRT_NOEXCEPT {
return input_types[0];
}
nvinfer1::DimsExprs MatmulPluginDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
int outputIndex,
const nvinfer1::DimsExprs* inputs,
int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT {
nvinfer1::DimsExprs output_dims(inputs[0]);
if (transB_) {
......@@ -569,13 +804,17 @@ nvinfer1::DimsExprs MatmulPluginDynamic::getOutputDimensions(
}
bool MatmulPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs,
int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs,
int nbOutputs) TRT_NOEXCEPT {
PADDLE_ENFORCE_EQ(nbInputs, 2,
PADDLE_ENFORCE_EQ(nbInputs,
2,
platform::errors::InvalidArgument("Must have 2 inputs, "
"but got %d input(s). ",
nbInputs));
PADDLE_ENFORCE_EQ(nbOutputs, getNbOutputs(),
PADDLE_ENFORCE_EQ(nbOutputs,
getNbOutputs(),
platform::errors::InvalidArgument("Must have 1 output, "
"but got %d output(s). ",
nbOutputs));
......@@ -591,7 +830,8 @@ bool MatmulPluginDynamic::supportsFormatCombination(
}
void MatmulPluginDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* outputs,
int nbOutputs) TRT_NOEXCEPT {
float inscale_0 = inputs[0].desc.scale;
......@@ -632,46 +872,49 @@ void MatmulPluginDynamic::configurePlugin(
}
PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc(
reinterpret_cast<void**>(&alpha_scale_), n_max * sizeof(float)));
cudaMemcpyAsync(alpha_scale_, &alpha_tem[0], n_max * sizeof(float),
cudaMemcpyAsync(alpha_scale_,
&alpha_tem[0],
n_max * sizeof(float),
cudaMemcpyHostToDevice);
float zero_tem = zero;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(float)));
cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpyAsync(
alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice);
float one_tem = 1;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMalloc(reinterpret_cast<void**>(&alpha_one_), sizeof(float)));
cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpyAsync(
alpha_one_, &one_tem, sizeof(float), cudaMemcpyHostToDevice);
} else if (type_ == nvinfer1::DataType::kHALF) {
half alpha_tem = static_cast<half>(alpha_);
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(half)));
cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half),
cudaMemcpyHostToDevice);
cudaMemcpyAsync(
alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice);
half zero_tem = static_cast<half>(zero);
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(half)));
cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(half),
cudaMemcpyHostToDevice);
cudaMemcpyAsync(
alpha_zero_, &zero_tem, sizeof(half), cudaMemcpyHostToDevice);
} else {
float alpha_tem = alpha_;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(float)));
cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpyAsync(
alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice);
float zero_tem = zero;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(float)));
cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpyAsync(
alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice);
}
}
void MatmulPluginDynamic::attachToContext(
cudnnContext* cudnnContext, cublasContext* cublasContext,
nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {
void MatmulPluginDynamic::attachToContext(cudnnContext* cudnnContext,
cublasContext* cublasContext,
nvinfer1::IGpuAllocator* gpuAllocator)
TRT_NOEXCEPT {
dyl::cublasLtCreate(&cublas_);
}
......@@ -710,7 +953,8 @@ void MatmulPluginDynamic::terminate() TRT_NOEXCEPT {
int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs,
void* const* outputs, void* workspace,
void* const* outputs,
void* workspace,
cudaStream_t stream) TRT_NOEXCEPT {
const auto Input0Desc = inputDesc[0];
const auto Input1Desc = inputDesc[1];
......@@ -770,64 +1014,88 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
cublasLtOrder_t COL32 = CUBLASLT_ORDER_COL32;
cublasLtOrder_t COL4_4R2_8C = CUBLASLT_ORDER_COL4_4R2_8C;
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Adesc,
cudadataTypeIO,
AopTranspose == CUBLAS_OP_N ? n : k,
AopTranspose == CUBLAS_OP_N ? k : n,
AopTranspose == CUBLAS_OP_N ? n : k));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Adesc,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea),
Adesc,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(stridea),
sizeof(stridea)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Bdesc,
cudadataTypeIO,
BopTranspose == CUBLAS_OP_N ? k : m,
BopTranspose == CUBLAS_OP_N ? m : k,
BopTranspose == CUBLAS_OP_N ? k : m));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Bdesc,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb),
Bdesc,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(strideb),
sizeof(strideb)));
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Cdesc,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec),
Cdesc,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(stridec),
sizeof(stridec)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&AtransformDesc, cudadataTypeIO, n, k, ldatransform));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
AtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(AtransformDesc,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&BtransformDesc, cudadataTypeIO, m, k, ldbtransform));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(BtransformDesc,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL4_4R2_8C,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(BtransformDesc,
CUBLASLT_MATRIX_LAYOUT_ORDER,
&COL4_4R2_8C,
sizeof(COL4_4R2_8C)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&CtransformDesc, cudadataTypeIO, n, m, ldctransform));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
CtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(CtransformDesc,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32)));
......@@ -837,23 +1105,33 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixTransformDescCreate(&transformDescT, cudaDataTypeS));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
&cudaDataTypeS, sizeof(cudaDataTypeS)));
transformDescT,
CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
&cudaDataTypeS,
sizeof(cudaDataTypeS)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &Transpose,
transformDescT,
CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA,
&Transpose,
sizeof(Transpose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
&transform_model, sizeof(transform_model)));
transformDescT,
CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
&transform_model,
sizeof(transform_model)));
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixTransformDescCreate(&transformDescN, cudaDataTypeS));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescN, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
&cudaDataTypeS, sizeof(cudaDataTypeS)));
transformDescN,
CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
&cudaDataTypeS,
sizeof(cudaDataTypeS)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescN, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
&transform_model, sizeof(transform_model)));
transformDescN,
CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
&transform_model,
sizeof(transform_model)));
cublasOperation_t ATranspose = CUBLAS_OP_N, BTranspose = CUBLAS_OP_T;
cublasLtPointerMode_t matmul_model =
......@@ -867,24 +1145,48 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
&matmulDesc, cudaComputeType, cudaDataTypeS));
#endif
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &ATranspose,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
CUBLASLT_MATMUL_DESC_TRANSA,
&ATranspose,
sizeof(ATranspose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BTranspose,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
CUBLASLT_MATMUL_DESC_TRANSB,
&BTranspose,
sizeof(BTranspose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
CUBLASLT_MATMUL_DESC_POINTER_MODE,
&matmul_model,
sizeof(matmul_model)));
const int8_t* B = static_cast<const int8_t*>(inputs[0]);
const int8_t* A = static_cast<const int8_t*>(inputs[1]);
int8_t* C = static_cast<int8_t*>(outputs[0]);
Ltgemm_int8_linear(cublas_, A, Adesc, Atransform_, AtransformDesc, transA_,
B, Bdesc, Btransform_, BtransformDesc, transB_, C, Cdesc,
Ctransform_, CtransformDesc, transformDescT,
transformDescN, matmulDesc, alpha_scale_, alpha_zero_,
alpha_one_, workspace, stream);
Ltgemm_int8_linear(cublas_,
A,
Adesc,
Atransform_,
AtransformDesc,
transA_,
B,
Bdesc,
Btransform_,
BtransformDesc,
transB_,
C,
Cdesc,
Ctransform_,
CtransformDesc,
transformDescT,
transformDescN,
matmulDesc,
alpha_scale_,
alpha_zero_,
alpha_one_,
workspace,
stream);
} else if (type_ == nvinfer1::DataType::kHALF) {
cudaDataType_t cudadataTypeIO = CUDA_R_16F;
cudaDataType_t cudaDataTypeS = CUDA_R_16F;
......@@ -893,41 +1195,57 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
#else
cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_16F;
#endif
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Adesc,
cudadataTypeIO,
AopTranspose == CUBLAS_OP_N ? n : k,
AopTranspose == CUBLAS_OP_N ? k : n,
AopTranspose == CUBLAS_OP_N ? n : k));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Adesc,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea),
Adesc,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(stridea),
sizeof(stridea)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Bdesc,
cudadataTypeIO,
BopTranspose == CUBLAS_OP_N ? k : m,
BopTranspose == CUBLAS_OP_N ? m : k,
BopTranspose == CUBLAS_OP_N ? k : m));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Bdesc,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb),
Bdesc,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(strideb),
sizeof(strideb)));
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Cdesc,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec),
Cdesc,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(stridec),
sizeof(stridec)));
cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE;
......@@ -940,21 +1258,37 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
&matmulDesc, cudaComputeType, cudaDataTypeS));
#endif
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
CUBLASLT_MATMUL_DESC_TRANSA,
&AopTranspose,
sizeof(AopTranspose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
CUBLASLT_MATMUL_DESC_TRANSB,
&BopTranspose,
sizeof(BopTranspose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
CUBLASLT_MATMUL_DESC_POINTER_MODE,
&matmul_model,
sizeof(matmul_model)));
const half* B = static_cast<const half*>(inputs[0]);
const half* A = static_cast<const half*>(inputs[1]);
half* C = static_cast<half*>(outputs[0]);
Ltgemm_fp16_linear(cublas_, A, Adesc, B, Bdesc, C, Cdesc, matmulDesc,
alpha_scale_, alpha_zero_, workspace, stream);
Ltgemm_fp16_linear(cublas_,
A,
Adesc,
B,
Bdesc,
C,
Cdesc,
matmulDesc,
alpha_scale_,
alpha_zero_,
workspace,
stream);
} else {
cudaDataType_t cudadataTypeIO = CUDA_R_32F;
cudaDataType_t cudaDataTypeS = CUDA_R_32F;
......@@ -963,41 +1297,57 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
#else
cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F_FAST_16F;
#endif
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Adesc,
cudadataTypeIO,
AopTranspose == CUBLAS_OP_N ? n : k,
AopTranspose == CUBLAS_OP_N ? k : n,
AopTranspose == CUBLAS_OP_N ? n : k));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Adesc,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea),
Adesc,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(stridea),
sizeof(stridea)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Bdesc,
cudadataTypeIO,
BopTranspose == CUBLAS_OP_N ? k : m,
BopTranspose == CUBLAS_OP_N ? m : k,
BopTranspose == CUBLAS_OP_N ? k : m));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Bdesc,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb),
Bdesc,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(strideb),
sizeof(strideb)));
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatrixLayoutSetAttribute(Cdesc,
CUBLASLT_MATRIX_LAYOUT_TYPE,
&cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec),
Cdesc,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
&(stridec),
sizeof(stridec)));
cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE;
......@@ -1010,21 +1360,37 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
&matmulDesc, cudaComputeType, cudaDataTypeS));
#endif
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
CUBLASLT_MATMUL_DESC_TRANSA,
&AopTranspose,
sizeof(AopTranspose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
CUBLASLT_MATMUL_DESC_TRANSB,
&BopTranspose,
sizeof(BopTranspose)));
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model,
PADDLE_ENFORCE_GPU_SUCCESS(
dyl::cublasLtMatmulDescSetAttribute(matmulDesc,
CUBLASLT_MATMUL_DESC_POINTER_MODE,
&matmul_model,
sizeof(matmul_model)));
const float* B = static_cast<const float*>(inputs[0]);
const float* A = static_cast<const float*>(inputs[1]);
float* C = static_cast<float*>(outputs[0]);
Ltgemm_fp32_linear(cublas_, A, Adesc, B, Bdesc, C, Cdesc, matmulDesc,
alpha_scale_, alpha_zero_, workspace, stream);
Ltgemm_fp32_linear(cublas_,
A,
Adesc,
B,
Bdesc,
C,
Cdesc,
matmulDesc,
alpha_scale_,
alpha_zero_,
workspace,
stream);
}
return cudaGetLastError() != cudaSuccess;
}
......
......@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef POLY_UTIL_CC_
#define POLY_UTIL_CC_
......@@ -26,61 +25,62 @@ using gpc::gpc_free_polygon;
using gpc::gpc_polygon_clip;
template <class T>
void Array2PointVec(const T*& box, const size_t box_size,
std::vector<Point_<T>>& vec) {
void Array2PointVec(const T* box,
const size_t box_size,
std::vector<Point_<T>>* vec) {
size_t pts_num = box_size / 2;
vec.resize(pts_num);
(*vec).resize(pts_num);
for (size_t i = 0; i < pts_num; i++) {
vec.at(i).x = box[2 * i];
vec.at(i).y = box[2 * i + 1];
(*vec).at(i).x = box[2 * i];
(*vec).at(i).y = box[2 * i + 1];
}
}
template <class T>
void Array2Poly(const T*& box, const size_t box_size, gpc::gpc_polygon& poly) {
void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly) {
size_t pts_num = box_size / 2;
poly.num_contours = 1;
poly.hole = (int*)malloc(sizeof(int));
poly.hole[0] = 0;
poly.contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
poly.contour->num_vertices = pts_num;
poly.contour->vertex =
(*poly).num_contours = 1;
(*poly).hole = reinterpret_cast<int*>(malloc(sizeof(int)));
(*poly).hole[0] = 0;
(*poly).contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
(*poly).contour->num_vertices = pts_num;
(*poly).contour->vertex =
(gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num);
for (size_t i = 0; i < pts_num; ++i) {
poly.contour->vertex[i].x = box[2 * i];
poly.contour->vertex[i].y = box[2 * i + 1];
(*poly).contour->vertex[i].x = box[2 * i];
(*poly).contour->vertex[i].y = box[2 * i + 1];
}
}
template <class T>
void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon& poly) {
void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon* poly) {
int pts_num = vec.size();
poly.num_contours = 1;
poly.hole = (int*)malloc(sizeof(int));
poly.hole[0] = 0;
poly.contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
poly.contour->num_vertices = pts_num;
poly.contour->vertex =
(*poly).num_contours = 1;
(*poly).hole = reinterpret_cast<int*>(malloc(sizeof(int)));
(*poly).hole[0] = 0;
(*poly).contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
(*poly).contour->num_vertices = pts_num;
(*poly).contour->vertex =
(gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num);
for (size_t i = 0; i < pts_num; ++i) {
poly.contour->vertex[i].x = vec[i].x;
poly.contour->vertex[i].y = vec[i].y;
(*poly).contour->vertex[i].x = vec[i].x;
(*poly).contour->vertex[i].y = vec[i].y;
}
}
template <class T>
void Poly2PointVec(const gpc::gpc_vertex_list& contour,
std::vector<Point_<T>>& vec) {
std::vector<Point_<T>>* vec) {
int pts_num = contour.num_vertices;
vec.resize(pts_num);
(*vec).resize(pts_num);
for (int i = 0; i < pts_num; i++) {
vec.at(i).x = contour.vertex[i].x;
vec.at(i).y = contour.vertex[i].y;
(*vec).at(i).x = contour.vertex[i].x;
(*vec).at(i).y = contour.vertex[i].y;
}
}
template <class T>
T GetContourArea(std::vector<Point_<T>>& vec) {
T GetContourArea(const std::vector<Point_<T>>& vec) {
size_t pts_num = vec.size();
if (pts_num < 3) return T(0.);
T area = T(0.);
......@@ -96,17 +96,19 @@ T PolyArea(const T* box, const size_t box_size, const bool normalized) {
// If coordinate values are is invalid
// if area size <= 0, return 0.
std::vector<Point_<T>> vec;
Array2PointVec<T>(box, box_size, vec);
Array2PointVec<T>(box, box_size, &vec);
return GetContourArea<T>(vec);
}
template <class T>
T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size,
T PolyOverlapArea(const T* box1,
const T* box2,
const size_t box_size,
const bool normalized) {
gpc::gpc_polygon poly1;
gpc::gpc_polygon poly2;
Array2Poly<T>(box1, box_size, poly1);
Array2Poly<T>(box2, box_size, poly2);
Array2Poly<T>(box1, box_size, &poly1);
Array2Poly<T>(box2, box_size, &poly2);
gpc::gpc_polygon respoly;
gpc::gpc_op op = gpc::GPC_INT;
gpc::gpc_polygon_clip(op, &poly2, &poly1, &respoly);
......@@ -115,7 +117,7 @@ T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size,
int contour_num = respoly.num_contours;
for (int i = 0; i < contour_num; ++i) {
std::vector<Point_<T>> resvec;
Poly2PointVec<T>(respoly.contour[i], resvec);
Poly2PointVec<T>(respoly.contour[i], &resvec);
// inter_area += std::fabs(cv::contourArea(resvec)) + 0.5f *
// (cv::arcLength(resvec, true));
inter_area += GetContourArea<T>(resvec);
......
......@@ -11,9 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef POLY_UTIL_H_
#define POLY_UTIL_H_
#pragma once
#include <vector>
......@@ -44,31 +42,32 @@ class Point_ {
};
template <class T>
void Array2PointVec(const T*& box, const size_t box_size,
std::vector<Point_<T>>& vec);
void Array2PointVec(const T* box,
const size_t box_size,
std::vector<Point_<T>>* vec);
template <class T>
void Array2Poly(const T*& box, const size_t box_size, gpc::gpc_polygon& poly);
void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly);
template <class T>
void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon& poly);
void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon* poly);
template <class T>
void Poly2PointVec(const gpc::gpc_vertex_list& contour,
std::vector<Point_<T>>& vec);
std::vector<Point_<T>>* vec);
template <class T>
T GetContourArea(std::vector<Point_<T>>& vec);
T GetContourArea(const std::vector<Point_<T>>& vec);
template <class T>
T PolyArea(const T* box, const size_t box_size, const bool normalized);
template <class T>
T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size,
T PolyOverlapArea(const T* box1,
const T* box2,
const size_t box_size,
const bool normalized);
} // namespace operators
} // namespace paddle
#include "paddle/fluid/operators/detection/poly_util.cc"
#endif // POLY_UTIL_H_
......@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h"
namespace {
......@@ -49,19 +48,23 @@ static std::vector<int64_t> Transpose(const std::vector<int64_t>& x,
size_t axis_size = axis.size();
auto axis_set = std::set<int>(axis.begin(), axis.end());
PADDLE_ENFORCE_EQ(axis_set.size(), axis_size,
PADDLE_ENFORCE_EQ(axis_set.size(),
axis_size,
paddle::platform::errors::InvalidArgument(
"In an axis array, elements must be unique."));
PADDLE_ENFORCE_EQ(in_rank, axis_size,
PADDLE_ENFORCE_EQ(in_rank,
axis_size,
paddle::platform::errors::InvalidArgument(
"The input dimension's size "
"should be equal to the axis's size. "
"But received dimension is %d, "
"axis's size is %d",
in_rank, axis_size));
in_rank,
axis_size));
PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), axis_size,
PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()),
axis_size,
paddle::platform::errors::InvalidArgument(
"Axis values must be ranging from 0 to (dims - 1)."));
......@@ -85,7 +88,8 @@ std::vector<int64_t> GetInputStrides(const ExecutionContext& ctx,
auto& MatrixDimsFromVector =
input_name == "X" ? RowMatrixDimsFromVector : ColumnMatrixDimsFromVector;
phi::funcs::MatDescriptor mat_dim = phi::funcs::CreateMatrixDescriptor(
MatrixDimsFromVector(new_dims), 0,
MatrixDimsFromVector(new_dims),
0,
ctx.Attr<bool>(std::string("trans_") +
static_cast<char>(std::tolower(input_name[0]))));
......@@ -125,16 +129,27 @@ template <typename T>
void ExecuteMatMulV2(const ExecutionContext& ctx,
const MKLDNNDeviceContext& dev_ctx,
const dnnl::engine onednn_engine,
paddle::platform::Place cpu_place, const Tensor* x,
std::vector<int64_t>& x_dims, bool trans_x,
const Tensor* y, std::vector<int64_t>& y_dims,
bool trans_y, Tensor* out, std::vector<int64_t>& out_dims,
paddle::platform::Place cpu_place,
const Tensor* x,
const std::vector<int64_t>& x_dims,
bool trans_x,
const Tensor* y,
const std::vector<int64_t>& y_dims,
bool trans_y,
Tensor* out,
const std::vector<int64_t>& out_dims,
int execution_number = 0) {
std::vector<int64_t> x_strides_override = GetInputStrides(ctx, "X");
std::vector<int64_t> y_strides_override = GetInputStrides(ctx, "Y");
MatMulV2MKLDNNHandler<T> handler(onednn_engine, ctx.GetPlace(), x_dims,
trans_x, y_dims, trans_y, IsOutputFused(ctx),
x_strides_override, y_strides_override);
MatMulV2MKLDNNHandler<T> handler(onednn_engine,
ctx.GetPlace(),
x_dims,
trans_x,
y_dims,
trans_y,
IsOutputFused(ctx),
x_strides_override,
y_strides_override);
const auto src_memory_p = handler.AcquireSrcMemory(x);
const auto weights_memory_p = handler.AcquireWeightsMemory(y);
......@@ -177,44 +192,48 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel<T> {
void CalculateMatrixDims(const ExecutionContext& ctx,
const std::vector<int64_t>& x_dims,
const std::vector<int64_t>& y_dims,
std::vector<int64_t>& x_bd_dims,
std::vector<int64_t>& y_bd_dims,
std::vector<int64_t>& out_dims, Tensor* out) const {
std::vector<int64_t>* x_bd_dims,
std::vector<int64_t>* y_bd_dims,
std::vector<int64_t>* out_dims,
Tensor* out) const {
if (x_dims.size() == 1) {
x_bd_dims[x_bd_dims.size() - 1] = x_dims[0];
(*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[0];
} else if (x_dims.size() == 2) {
x_bd_dims[x_bd_dims.size() - 1] = x_dims[1];
x_bd_dims[x_bd_dims.size() - 2] = x_dims[0];
(*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[1];
(*x_bd_dims)[(*x_bd_dims).size() - 2] = x_dims[0];
} else {
for (size_t i = 0; i < x_dims.size(); ++i) {
x_bd_dims[x_bd_dims.size() - x_dims.size() + i] = x_dims[i];
(*x_bd_dims)[(*x_bd_dims).size() - x_dims.size() + i] = x_dims[i];
}
}
if (y_dims.size() == 1) {
y_bd_dims[x_bd_dims.size() - 2] = y_dims[0];
(*y_bd_dims)[(*x_bd_dims).size() - 2] = y_dims[0];
} else if (y_dims.size() == 2) {
y_bd_dims[y_bd_dims.size() - 1] = y_dims[1];
y_bd_dims[y_bd_dims.size() - 2] = y_dims[0];
(*y_bd_dims)[(*y_bd_dims).size() - 1] = y_dims[1];
(*y_bd_dims)[(*y_bd_dims).size() - 2] = y_dims[0];
} else {
for (size_t i = 0; i < y_dims.size(); ++i) {
y_bd_dims[y_bd_dims.size() - y_dims.size() + i] = y_dims[i];
(*y_bd_dims)[(*y_bd_dims).size() - y_dims.size() + i] = y_dims[i];
}
}
if (!IsOutputFused(ctx) && x_dims.size() > 2 && y_dims.size() > 2) {
for (size_t i = 0; i < x_bd_dims.size() - 2; ++i) {
for (size_t i = 0; i < (*x_bd_dims).size() - 2; ++i) {
PADDLE_ENFORCE_EQ(
x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] == 1 ||
y_bd_dims[i] == 1,
(*x_bd_dims)[i] == (*y_bd_dims)[i] || (*x_bd_dims)[i] == 1 ||
(*y_bd_dims)[i] == 1,
true,
paddle::platform::errors::InvalidArgument(
"Tensor dimensions are incorrect for broadcasting."
"Dimensions in X and Y must be same or equal to 1, but "
"received x_dim[%d]=%d and y_dims[%d]= %d",
i, x_bd_dims[i], i, y_bd_dims[i]));
out_dims[i] = std::max(x_bd_dims[i], y_bd_dims[i]);
i,
(*x_bd_dims)[i],
i,
(*y_bd_dims)[i]));
(*out_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]);
}
out->Resize(phi::make_ddim(out_dims));
out->Resize(phi::make_ddim((*out_dims)));
}
}
......@@ -238,11 +257,20 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel<T> {
std::vector<int64_t> x_bd_dims(ndims, 1);
std::vector<int64_t> y_bd_dims(ndims, 1);
CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, out_dims,
out);
ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x,
x_bd_dims, trans_x, y, y_bd_dims, trans_y, out,
CalculateMatrixDims(
ctx, x_dims, y_dims, &x_bd_dims, &y_bd_dims, &out_dims, out);
ExecuteMatMulV2<T>(ctx,
dev_ctx,
onednn_engine,
ctx.GetPlace(),
x,
x_bd_dims,
trans_x,
y,
y_bd_dims,
trans_y,
out,
out_dims);
}
};
......@@ -253,36 +281,46 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel<T> {
void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); }
private:
void CalculateGradMatrixDims(const ExecutionContext& ctx, Tensor* dx_tmp,
void CalculateGradMatrixDims(const ExecutionContext& ctx,
Tensor* dx_tmp,
Tensor* dy_tmp,
const std::vector<int64_t>& dx_dims,
const std::vector<int64_t>& dy_dims,
std::vector<int64_t>& dx_bd_dims,
std::vector<int64_t>& dy_bd_dims) const {
std::vector<int64_t>* dx_bd_dims,
std::vector<int64_t>* dy_bd_dims) const {
for (size_t i = 0; i < dx_dims.size() - 2; ++i) {
if (dx_dims[i] != dy_dims[i]) {
if (dx_dims[i] == 1) {
dx_bd_dims[i] = dy_dims[i];
(*dx_bd_dims)[i] = dy_dims[i];
} else {
dy_bd_dims[i] = dx_dims[i];
(*dy_bd_dims)[i] = dx_dims[i];
}
}
}
dx_tmp->Resize(phi::make_ddim(dx_bd_dims));
dx_tmp->Resize(phi::make_ddim((*dx_bd_dims)));
dx_tmp->mutable_data<T>(ctx.GetPlace());
dy_tmp->Resize(phi::make_ddim(dy_bd_dims));
dy_tmp->Resize(phi::make_ddim((*dy_bd_dims)));
dy_tmp->mutable_data<T>(ctx.GetPlace());
}
void ReduceSumForMatmulGradOutput(
const ExecutionContext& ctx, const MKLDNNDeviceContext& dev_ctx,
const dnnl::engine onednn_engine, const Tensor* dx_tmp, Tensor* dx,
std::vector<int64_t>& dx_dims,
const ExecutionContext& ctx,
const MKLDNNDeviceContext& dev_ctx,
const dnnl::engine onednn_engine,
const Tensor* dx_tmp,
Tensor* dx,
const std::vector<int64_t>& dx_dims,
const std::vector<int64_t>& squeezed_dims) const {
paddle::platform::ReductionMKLDNNHandler<T> handler(
dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine,
ctx.GetPlace(), dx_tmp, dx, dx_dims);
dnnl::algorithm::reduction_sum,
0.0f,
0.0f,
onednn_engine,
ctx.GetPlace(),
dx_tmp,
dx,
dx_dims);
auto src_memory_p = handler.AcquireSrcMemory(dx_tmp);
auto dst_memory_p = handler.AcquireDstMemory(dx);
......@@ -326,8 +364,8 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel<T> {
} else if (x_dims.size() != y_dims.size()) {
is_broadcast = true;
} else {
is_broadcast =
!std::equal(x_dims.cbegin(), x_dims.cbegin() + x_dims.size() - 2,
is_broadcast = !std::equal(x_dims.cbegin(),
x_dims.cbegin() + x_dims.size() - 2,
y_dims.cbegin());
}
......@@ -362,44 +400,138 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel<T> {
std::vector<int64_t> dx_bd_dims(x_dims);
std::vector<int64_t> dy_bd_dims(y_dims);
CalculateGradMatrixDims(ctx, &dx_tmp, &dy_tmp, x_dims, y_dims, dx_bd_dims,
dy_bd_dims);
CalculateGradMatrixDims(
ctx, &dx_tmp, &dy_tmp, x_dims, y_dims, &dx_bd_dims, &dy_bd_dims);
if (trans_x && trans_y) {
ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, y_dims,
true, dout, dout_dims, true, &dx_tmp, dx_bd_dims, 1);
ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout,
dout_dims, true, x, x_dims, true, &dy_tmp, dy_bd_dims,
ExecuteMatMulV2<T>(ctx,
dev_ctx,
onednn_engine,
ctx.GetPlace(),
y,
y_dims,
true,
dout,
dout_dims,
true,
&dx_tmp,
dx_bd_dims,
1);
ExecuteMatMulV2<T>(ctx,
dev_ctx,
onednn_engine,
ctx.GetPlace(),
dout,
dout_dims,
true,
x,
x_dims,
true,
&dy_tmp,
dy_bd_dims,
2);
} else if (trans_x) {
ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, y_dims,
false, dout, dout_dims, true, &dx_tmp, dx_bd_dims, 1);
ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, x_dims,
false, dout, dout_dims, false, &dy_tmp, dy_bd_dims, 2);
ExecuteMatMulV2<T>(ctx,
dev_ctx,
onednn_engine,
ctx.GetPlace(),
y,
y_dims,
false,
dout,
dout_dims,
true,
&dx_tmp,
dx_bd_dims,
1);
ExecuteMatMulV2<T>(ctx,
dev_ctx,
onednn_engine,
ctx.GetPlace(),
x,
x_dims,
false,
dout,
dout_dims,
false,
&dy_tmp,
dy_bd_dims,
2);
} else if (trans_y) {
ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout,
dout_dims, false, y, y_dims, false, &dx_tmp,
dx_bd_dims, 1);
ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout,
dout_dims, true, x, x_dims, false, &dy_tmp, dy_bd_dims,
ExecuteMatMulV2<T>(ctx,
dev_ctx,
onednn_engine,
ctx.GetPlace(),
dout,
dout_dims,
false,
y,
y_dims,
false,
&dx_tmp,
dx_bd_dims,
1);
ExecuteMatMulV2<T>(ctx,
dev_ctx,
onednn_engine,
ctx.GetPlace(),
dout,
dout_dims,
true,
x,
x_dims,
false,
&dy_tmp,
dy_bd_dims,
2);
} else {
ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout,
dout_dims, false, y, y_dims, true, &dx_tmp, dx_bd_dims,
ExecuteMatMulV2<T>(ctx,
dev_ctx,
onednn_engine,
ctx.GetPlace(),
dout,
dout_dims,
false,
y,
y_dims,
true,
&dx_tmp,
dx_bd_dims,
1);
ExecuteMatMulV2<T>(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, x_dims,
true, dout, dout_dims, false, &dy_tmp, dy_bd_dims, 2);
ExecuteMatMulV2<T>(ctx,
dev_ctx,
onednn_engine,
ctx.GetPlace(),
x,
x_dims,
true,
dout,
dout_dims,
false,
&dy_tmp,
dy_bd_dims,
2);
}
if (x_dims != dx_bd_dims) {
ReduceSumForMatmulGradOutput(ctx, dev_ctx, onednn_engine, &dx_tmp, dx,
x_dims, phi::vectorize(x->dims()));
ReduceSumForMatmulGradOutput(ctx,
dev_ctx,
onednn_engine,
&dx_tmp,
dx,
x_dims,
phi::vectorize(x->dims()));
} else {
*dx = std::move(dx_tmp);
}
if (y_dims != dy_bd_dims) {
ReduceSumForMatmulGradOutput(ctx, dev_ctx, onednn_engine, &dy_tmp, dy,
y_dims, phi::vectorize(y->dims()));
ReduceSumForMatmulGradOutput(ctx,
dev_ctx,
onednn_engine,
&dy_tmp,
dy,
y_dims,
phi::vectorize(y->dims()));
} else {
*dy = std::move(dy_tmp);
}
......@@ -413,10 +545,14 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel<T> {
};
} // anonymous namespace
REGISTER_OP_KERNEL(matmul_v2, MKLDNN, ::paddle::platform::CPUPlace,
REGISTER_OP_KERNEL(matmul_v2,
MKLDNN,
::paddle::platform::CPUPlace,
MatMulV2MKLDNNKernel<float>,
MatMulV2MKLDNNKernel<paddle::platform::bfloat16>);
REGISTER_OP_KERNEL(matmul_v2_grad, MKLDNN, ::paddle::platform::CPUPlace,
REGISTER_OP_KERNEL(matmul_v2_grad,
MKLDNN,
::paddle::platform::CPUPlace,
MatMulV2GradMKLDNNKernel<float>,
MatMulV2GradMKLDNNKernel<paddle::platform::bfloat16>);
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册