From 884905677825bff8b1785a19cbb60647f7b0bd20 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Fri, 29 Jul 2022 22:16:15 +0800 Subject: [PATCH] unify fluid::CUDADeviceContext and phi::GpuContext (#44723) * remove cudaDeviceContext * remove more template * fix rocm compile --- .../details/eager_deletion_op_handle.h | 6 - .../cuda_device_context_allocator.h | 5 - paddle/fluid/operators/cudnn_lstm_op.cu.cc | 7 - .../operators/fused/fused_seqpool_cvm_op.cu | 7 +- paddle/fluid/operators/gru_op.cu.cc | 7 - paddle/fluid/operators/math/cross_entropy.cu | 5 - paddle/fluid/operators/math/im2col.cu | 24 - paddle/fluid/operators/math/maxouting.cu | 6 - paddle/fluid/operators/math/sample_prob.h | 6 - .../operators/math/selected_rows_functor.cu | 161 +- .../fluid/operators/math/sequence_padding.cu | 159 -- paddle/fluid/operators/math/sequence_scale.cu | 40 - paddle/fluid/operators/math/softmax.cu | 35 - paddle/fluid/operators/math/vol2col.cu | 4 - .../sequence_ops/sequence_concat_op.cu.cc | 6 - paddle/fluid/platform/collective_helper.h | 1 - paddle/fluid/platform/device_context.cc | 5 - paddle/fluid/platform/device_context.h | 10 +- paddle/fluid/platform/transform.h | 60 - paddle/phi/kernels/funcs/blas/blas_impl.cu.h | 1343 +---------------- paddle/phi/kernels/funcs/blas/blas_impl.hip.h | 928 +----------- paddle/phi/kernels/funcs/fc_functor.cu | 4 - paddle/phi/kernels/funcs/for_range.h | 16 - paddle/phi/kernels/funcs/math_function.cu | 69 +- paddle/phi/kernels/funcs/matrix_inverse.cu.cc | 5 - paddle/phi/kernels/funcs/matrix_solve.cu | 4 - 26 files changed, 122 insertions(+), 2801 deletions(-) diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index 6e18945bc3..a30e80b204 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -23,12 +23,6 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" -namespace paddle { -namespace platform { -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace framework { class GarbageCollector; diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index bb6a55e12c..662bcc401b 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -25,11 +25,6 @@ #include "paddle/fluid/platform/place.h" namespace paddle { - -namespace platform { -class CUDADeviceContext; -} // namespace platform - namespace memory { namespace allocation { diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 75c74e5d73..bf3009e1fe 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -23,13 +23,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/miopen_lstm_cache.h" #endif -namespace paddle { -namespace platform { -class CUDADeviceContext; - -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index 7b8cd9b5fc..6aba49ea33 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -182,7 +182,7 @@ void FusedSeqpoolCVM(const framework::ExecutionContext #endif size_t N = static_cast(batch_size * slot_num * embedding_size); - platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N); + platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, N); // first sum pool FusedSeqpoolKernelNormal<<(batch_size * slot_num * (embedding_size - cvm_offset)); - platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N); + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(dev_ctx, N); FusedCVMKernelNoCVM<<(batch_size * slot_num * embedding_size); - auto config = GetGpuLaunchConfig1D(dev_ctx, N); + auto config = platform::GetGpuLaunchConfig1D(dev_ctx, N); if (use_cvm) { // join grad FusedSeqpoolCVMGradKernelWithCVM<<::operator()( } } -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; - template class CrossEntropyFunctor; template class CrossEntropyFunctor; template class CrossEntropyFunctor; diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu index 083c090246..5812b5d9b2 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/fluid/operators/math/im2col.cu @@ -308,24 +308,12 @@ class Col2ImFunctor; -template class Im2ColFunctor; template class Im2ColFunctor; template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; template class Col2ImFunctor; @@ -576,12 +564,6 @@ class Col2ImFunctor; -template class Im2ColFunctor; template class Im2ColFunctor; @@ -589,12 +571,6 @@ template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; template class Col2ImFunctor; diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index b7c878a671..c84d908972 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -173,12 +173,6 @@ void MaxOutGradFunctor::operator()( axis); } -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; - -template class MaxOutFunctor; -template class MaxOutFunctor; - template class MaxOutGradFunctor; template class MaxOutGradFunctor; diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index ade3cab0ea..bb5c2ef979 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -22,12 +22,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/sampler.h" #include "paddle/phi/core/ddim.h" -namespace paddle { -namespace platform { -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index bc247524a9..f09578a0b1 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -133,77 +133,6 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows, } } // namespace -template -struct SelectedRowsAddTensor { - void operator()(const platform::CUDADeviceContext& context, - const phi::SelectedRows& input1, - const framework::Tensor& input2, - framework::Tensor* output) { - auto in1_height = input1.height(); - auto in2_dims = input2.dims(); - auto out_dims = output->dims(); - PADDLE_ENFORCE_EQ( - in1_height, - in2_dims[0], - platform::errors::InvalidArgument( - "The two inputs height must be equal." - "But received first input height = [%d], first input height = [%d]", - in1_height, - in2_dims[0])); - PADDLE_ENFORCE_EQ( - in1_height, - out_dims[0], - platform::errors::InvalidArgument( - "The input and output height must be equal." - "But received input height = [%d], output height = [%d]", - in1_height, - out_dims[0])); - - auto& in1_value = input1.value(); - auto& in1_rows = input1.rows(); - - int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ( - in1_row_numel, - input2.numel() / in1_height, - platform::errors::InvalidArgument( - "The two inputs width must be equal." - "But received first input width = [%d], second input width = [%d]", - in1_row_numel, - input2.numel() / in1_height)); - PADDLE_ENFORCE_EQ( - in1_row_numel, - output->numel() / in1_height, - platform::errors::InvalidArgument( - "The input and output width must be equal." 
- "But received input width = [%d], output width = [%d]", - in1_row_numel, - output->numel() / in1_height)); - - auto* in1_data = in1_value.data(); - auto* in2_data = input2.data(); - auto* out_data = output->data(); - - phi::funcs::SetConstant functor; - functor(context, output, static_cast(0)); - - const int block_size = 256; - dim3 threads(block_size, 1); - dim3 grid(in1_rows.size(), 1); - paddle::framework::MixVector mixv_in1_rows(&in1_rows); - SelectedRowsAddTensorKernel - <<>>( - in1_data, - mixv_in1_rows.CUDAData(context.GetPlace()), - out_data, - in1_row_numel); - - auto out_eigen = framework::EigenVector::Flatten(*output); - auto in2_eigen = framework::EigenVector::Flatten(input2); - out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; - } -}; - template struct SelectedRowsAddTensor { void operator()(const phi::GPUContext& context, @@ -275,12 +204,6 @@ struct SelectedRowsAddTensor { } }; -template struct SelectedRowsAddTensor; -template struct SelectedRowsAddTensor; -template struct SelectedRowsAdd; -template struct SelectedRowsAddTensor; - template struct SelectedRowsAddTensor; template struct SelectedRowsAddTensor; template struct SelectedRowsAdd; @@ -363,50 +286,6 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows, } } // namespace -template -struct SelectedRowsAddToTensor { - void operator()(const platform::CUDADeviceContext& context, - const phi::SelectedRows& input1, - framework::Tensor* input2) { - auto in1_height = input1.height(); - auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ( - in1_height, - in2_dims[0], - platform::errors::InvalidArgument("The two inputs height must be equal." - "But received first input height = " - "[%d], second input height = [%d]", - in1_height, - in2_dims[0])); - - auto& in1_value = input1.value(); - auto& in1_rows = input1.rows(); - - int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ( - in1_row_numel, - input2->numel() / in1_height, - platform::errors::InvalidArgument( - "The two inputs width must be equal." - "But received first input width = [%d], second input width = [%d]", - in1_row_numel, - input2->numel() / in1_height)); - - auto* in1_data = in1_value.data(); - auto* in2_data = input2->data(); - const int block_size = 256; - dim3 threads(block_size, 1); - dim3 grid(in1_rows.size(), 1); - paddle::framework::MixVector mixv_in1_rows(&in1_rows); - SelectedRowsAddToTensorKernel - <<>>( - in1_data, - mixv_in1_rows.CUDAData(context.GetPlace()), - in2_data, - in1_row_numel); - } -}; - template struct SelectedRowsAddToTensor { void operator()(const phi::GPUContext& context, @@ -451,12 +330,6 @@ struct SelectedRowsAddToTensor { } }; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; @@ -625,34 +498,6 @@ struct MergeAddImpl { } }; -template -struct MergeAdd { - // unary functor, merge by adding duplicated rows in - // the input SelectedRows object. 
- phi::SelectedRows operator()(const platform::CUDADeviceContext& context, - const phi::SelectedRows& input, - const bool sorted_result) { - return MergeAddImpl()( - context, input, sorted_result); - } - - void operator()(const platform::CUDADeviceContext& context, - const phi::SelectedRows& input, - phi::SelectedRows* output, - const bool sorted_result) { - MergeAddImpl()( - context, input, output, sorted_result); - } - - void operator()(const platform::CUDADeviceContext& context, - const std::vector& inputs, - phi::SelectedRows* output, - const bool sorted_result) { - MergeAddImpl()( - context, inputs, output, sorted_result); - } -}; - template struct MergeAdd { // unary functor, merge by adding duplicated rows in @@ -678,10 +523,8 @@ struct MergeAdd { } }; -#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \ - template struct MergeAddImpl; \ - template struct MergeAddImpl; \ - template struct MergeAdd; \ +#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \ + template struct MergeAddImpl; \ template struct MergeAdd; TEMPLATE_SPECIALIZED_FOR_MERGEADD(float) diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index eb573a2b5f..8b0156af47 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -57,88 +57,6 @@ __global__ void SequencePaddingKernel(T* dst, } } -template -class PaddingLoDTensorFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::LoDTensor& seq_tensor, - framework::LoDTensor* pad_tensor, - const framework::LoDTensor& pad_value, - int pad_seq_len = -1, - int lod_level = 0, - bool norm_by_times = false, - const PadLayout layout = kBatchLengthWidth) { - auto seq_lod = seq_tensor.lod(); - auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; - const auto& seq_tensor_dims = seq_tensor.dims(); - const auto& pad_tensor_dims = pad_tensor->dims(); - int max_seq_len = MaximumSequenceLength(seq_offsets); - if (pad_seq_len == -1) { - pad_seq_len = max_seq_len; - } - PADDLE_ENFORCE_GE( - pad_seq_len, - max_seq_len, - platform::errors::InvalidArgument( - "The pad_seq_len must be equal to or greater than the " - "original max sequence length. Expected %ld >= %ld, but got %ld < " - "%ld. Please check the input value.", - pad_seq_len, - max_seq_len, - pad_seq_len, - max_seq_len)); - int step_width = seq_tensor.numel() / seq_tensor_dims[0]; - int seq_num = seq_offsets.size() - 1; - - CheckDims(seq_tensor_dims, - pad_tensor_dims, - seq_offsets, - pad_seq_len, - step_width, - layout); - PADDLE_ENFORCE_EQ( - pad_value.numel() == 1 || pad_value.numel() == step_width, - true, - platform::errors::InvalidArgument( - "The numel of 'pad_value' can only be 1 or be equal to " - "the 'step_width', but got %ld != 1 and %ld. Please check the " - "input value.", - pad_value.numel(), - step_width)); - - const int kBlockSize = 512; - - /* At least use 32 threads to copy sequence_width elements, - * and at least 8 elements for each thread. 
- */ - size_t block_dim_x = - std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); - size_t block_dim_y = kBlockSize / block_dim_x; - dim3 threads(block_dim_x, block_dim_y); - - size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; - size_t grid_dim_y = seq_num; - dim3 grid(grid_dim_x, grid_dim_y); - - const T* seq_data = seq_tensor.data(); - T* pad_data = pad_tensor->data(); - const T* pad_value_data = pad_value.data(); - - paddle::framework::MixVector mix_vector_seq_offsets(&seq_offsets); - SequencePaddingKernel<<>>( - pad_data, - seq_data, - pad_value_data, - pad_value.numel() == 1, - mix_vector_seq_offsets.CUDAData(context.GetPlace()), - seq_num, - pad_seq_len, - step_width, - norm_by_times, - layout); - } -}; - template class PaddingLoDTensorFunctor { public: @@ -221,73 +139,6 @@ class PaddingLoDTensorFunctor { } }; -template -class UnpaddingLoDTensorFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::LoDTensor& pad_tensor, - framework::LoDTensor* seq_tensor, - int pad_seq_len = -1, - int lod_level = 0, - bool norm_by_times = false, - const PadLayout layout = kBatchLengthWidth) { - auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level]; - const auto& seq_tensor_dims = seq_tensor->dims(); - const auto& pad_tensor_dims = pad_tensor.dims(); - int max_seq_len = MaximumSequenceLength(seq_offsets); - if (pad_seq_len == -1) { - pad_seq_len = max_seq_len; - } - int step_width = seq_tensor->numel() / seq_tensor_dims[0]; - int seq_num = seq_offsets.size() - 1; - - CheckDims(seq_tensor_dims, - pad_tensor_dims, - seq_offsets, - pad_seq_len, - step_width, - layout); - /* - if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) { - paddle::framework::TensorCopy(pad_tensor, context.GetPlace(), context, - seq_tensor); - seq_tensor->Resize(seq_tensor_dims); - return; - } - */ - - const int kBlockSize = 512; - - /* At least use 32 threads to copy sequence_width elements, - * and at least 8 elements for each thread. 
- */ - size_t block_dim_x = - std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); - size_t block_dim_y = kBlockSize / block_dim_x; - dim3 threads(block_dim_x, block_dim_y); - - size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; - size_t grid_dim_y = seq_num; - dim3 grid(grid_dim_x, grid_dim_y); - - const T* pad_data = pad_tensor.data(); - T* seq_data = seq_tensor->data(); - - paddle::framework::MixVector mixv_seq_offsets(&seq_offsets); - SequencePaddingKernel<<>>( - seq_data, - pad_data, - nullptr, - false, - mixv_seq_offsets.CUDAData(context.GetPlace()), - seq_num, - pad_seq_len, - step_width, - norm_by_times, - layout); - } -}; - template class UnpaddingLoDTensorFunctor { public: @@ -355,16 +206,6 @@ class UnpaddingLoDTensorFunctor { } }; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; - -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; - template class PaddingLoDTensorFunctor; template class PaddingLoDTensorFunctor; template class PaddingLoDTensorFunctor; diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu index 4fb8cc0ef7..1130a86838 100644 --- a/paddle/fluid/operators/math/sequence_scale.cu +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -35,43 +35,6 @@ __global__ void SequenceScaleKernel(T* seq, } } -template -class ScaleLoDTensorFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const T* scales, - framework::LoDTensor* seq) { - const size_t level = 0; - auto lod = seq->lod(); - const size_t num_seq = lod[level].size() - 1; - const size_t seq_width = seq->numel() / seq->dims()[0]; - auto abs_offset_lod = framework::ToAbsOffset(lod); - T* seq_data = seq->mutable_data(context.GetPlace()); - paddle::framework::MixVector mix_vector(&(abs_offset_lod[level])); - -#ifdef PADDLE_WITH_HIP - hipLaunchKernelGGL( - HIP_KERNEL_NAME(SequenceScaleKernel), - dim3(num_seq), - dim3(PADDLE_CUDA_NUM_THREADS), - 0, - context.stream(), - seq_data, - mix_vector.CUDAMutableData(context.GetPlace()), - scales, - seq_width); -#else - SequenceScaleKernel - <<>>( - seq_data, - mix_vector.CUDAMutableData(context.GetPlace()), - scales, - seq_width); -#endif - mix_vector.CopyToCPU(); - } -}; - template class ScaleLoDTensorFunctor { public: @@ -109,9 +72,6 @@ class ScaleLoDTensorFunctor { } }; -template class ScaleLoDTensorFunctor; -template class ScaleLoDTensorFunctor; - template class ScaleLoDTensorFunctor; template class ScaleLoDTensorFunctor; diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index e5bcca8dab..47621883fd 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -141,56 +141,21 @@ void SoftmaxGradCUDNNFunctor::operator()( #endif } -template class SoftmaxCUDNNFunctor; -template class SoftmaxCUDNNFunctor; -template class SoftmaxGradCUDNNFunctor; -template class SoftmaxGradCUDNNFunctor; template class SoftmaxCUDNNFunctor; template class SoftmaxCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; #if CUDNN_VERSION_MIN(8, 1, 0) -template class SoftmaxCUDNNFunctor; -template class SoftmaxGradCUDNNFunctor; template class SoftmaxCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; #endif // MIOPEN do not support double 
#ifndef PADDLE_WITH_HIP -template class SoftmaxCUDNNFunctor; -template class SoftmaxGradCUDNNFunctor; template class SoftmaxCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; #endif -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; - template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index 52dacbc080..90c2fcf6e2 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -417,13 +417,9 @@ void Col2VolFunctor::operator()( } // }; -template class Vol2ColFunctor; -template class Vol2ColFunctor; template class Vol2ColFunctor; template class Vol2ColFunctor; -template class Col2VolFunctor; -template class Col2VolFunctor; template class Col2VolFunctor; template class Col2VolFunctor; diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc index 4856e38011..f2117a2f09 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc @@ -16,12 +16,6 @@ #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace platform { -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - REGISTER_OP_CUDA_KERNEL( sequence_concat, paddle::operators::SeqConcatKernel { class CudnnWorkspaceHandle; class EigenCudaStreamDevice; -class CUDADeviceContext : public phi::GPUContext { - public: - explicit CUDADeviceContext(CUDAPlace place); - virtual ~CUDADeviceContext(); - - private: - int place_holder_; // TO BE REMOVED - DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); -}; +using CUDADeviceContext = phi::GPUContext; class CudnnWorkspaceHandle { public: diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h index 575415ef89..fc39fa33ff 100644 --- a/paddle/fluid/platform/transform.h +++ b/paddle/fluid/platform/transform.h @@ -96,66 +96,6 @@ struct Transform { }; #if defined(__NVCC__) || defined(__HIPCC__) -template <> -struct Transform { - template - void operator()(const platform::CUDADeviceContext& context, - InputIter first, - InputIter last, - OutputIter result, - UnaryOperation op) { - auto place = context.GetPlace(); - PADDLE_ENFORCE_EQ(is_gpu_place(place), - true, - platform::errors::PreconditionNotMet( - "The CUDA Transform must be used in GPU place.")); -#ifdef __HIPCC__ - thrust::transform(thrust::hip::par.on(context.stream()), - details::CastToCUDATransformIterator(first), - details::CastToCUDATransformIterator(last), - details::CastToCUDATransformIterator(result), - op); -#else - thrust::transform(thrust::cuda::par.on(context.stream()), - details::CastToCUDATransformIterator(first), - details::CastToCUDATransformIterator(last), - details::CastToCUDATransformIterator(result), - op); -#endif - } - - template - void operator()(const platform::CUDADeviceContext& context, - InputIter1 first1, - InputIter1 last1, - InputIter2 first2, - OutputIter result, - BinaryOperation op) { - auto place = context.GetPlace(); - PADDLE_ENFORCE_EQ(is_gpu_place(place), - true, - 
platform::errors::PreconditionNotMet( - "The CUDA Transform must be used in GPU place.")); -#ifdef __HIPCC__ - thrust::transform(thrust::hip::par.on(context.stream()), - details::CastToCUDATransformIterator(first1), - details::CastToCUDATransformIterator(last1), - details::CastToCUDATransformIterator(first2), - details::CastToCUDATransformIterator(result), - op); -#else - thrust::transform(thrust::cuda::par.on(context.stream()), - details::CastToCUDATransformIterator(first1), - details::CastToCUDATransformIterator(last1), - details::CastToCUDATransformIterator(first2), - details::CastToCUDATransformIterator(result), - op); -#endif - } -}; template <> struct Transform { diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index 3e197a18f9..e4f3dbf6a7 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -66,58 +66,6 @@ struct CUBlas { #endif } - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. - // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, - const void *A, - cudaDataType_t Atype, - int lda, - const void *B, - cudaDataType_t Btype, - int ldb, - const float *beta, - void *C, - cudaDataType_t Ctype, - int ldc) { -// Because the gcc 4.8 doesn't expand template parameter pack that -// appears in a lambda-expression, I can not use template parameter pack -// here. -#if CUDA_VERSION >= 8000 - VLOG(5) << "use_tensor_op_math: " - << (dev_ctx->tensor_core_available() ? "True" : "False"); - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cublasSgemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc)); - }); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "cublasSgemmEx is not supported on cuda <= 7.5")); -#endif - } - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode template @@ -366,66 +314,6 @@ struct CUBlas { #endif } - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. - // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - cudaDataType_t Atype, - int lda, - const void *B, - cudaDataType_t Btype, - int ldb, - const void *beta, - void *C, - cudaDataType_t Ctype, - int ldc, - cudaDataType_t computeType) { -#if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? 
"True" : "False"); -#endif // CUDA_VERSION >= 9000 - - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx is not supported on cuda <= 7.5")); -#endif - } - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode template @@ -636,66 +524,6 @@ struct CUBlas> { ldb)); } - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. - // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - cudaDataType_t Atype, - int lda, - const void *B, - cudaDataType_t Btype, - int ldb, - const void *beta, - void *C, - cudaDataType_t Ctype, - int ldc, - cudaDataType_t computeType) { -#if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); -#endif // CUDA_VERSION >= 9000 - - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx is not supported on cuda <= 7.5")); -#endif - } - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode template @@ -965,66 +793,6 @@ struct CUBlas> { batch_size)); } - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. - // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - cudaDataType_t Atype, - int lda, - const void *B, - cudaDataType_t Btype, - int ldb, - const void *beta, - void *C, - cudaDataType_t Ctype, - int ldc, - cudaDataType_t computeType) { -#if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); -#endif // CUDA_VERSION >= 9000 - - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx is not supported on cuda <= 7.5")); -#endif - } - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. 
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode template @@ -1088,16 +856,16 @@ struct CUBlas> { template <> template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + T alpha, + const T *A, + const T *B, + T beta, + T *C) const { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -1109,8 +877,7 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, #if CUDA_VERSION >= 8000 if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - auto &cuda_ctx = - const_cast(context_); + auto &cuda_ctx = const_cast(context_); CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, @@ -1152,151 +919,6 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 8000 } -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - -#if CUDA_VERSION >= 8000 - if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - auto &cuda_ctx = const_cast(context_); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - CUDA_R_32F, - ldb, - A, - CUDA_R_32F, - lda, - &beta, - C, - CUDA_R_32F, - N); - } else { -#endif // CUDA_VERSION >= 8000 - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - N); - }); - -#if CUDA_VERSION >= 8000 - } -#endif // CUDA_VERSION >= 8000 -} - -template <> -template <> -inline void Blas::GEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - const phi::dtype::float16 *B, - phi::dtype::float16 beta, - phi::dtype::float16 *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE( - context_.GetComputeCapability(), - 53, - phi::errors::InvalidArgument( - "cublas fp16 gemm requires GPU compute capability >= 53," - "but received %d", - context_.GetComputeCapability())); - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - -#if CUDA_VERSION >= 8000 - // cublasHgemm does true FP16 computation which is slow for non-Volta - // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: - // input/output in fp16, computation in fp32, which can also be accelerated - // using tensor cores in volta GPUs. 
- auto &cuda_ctx = const_cast(context_); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16F, - ldb, - A, - CUDA_R_16F, - lda, - &h_beta, - C, - CUDA_R_16F, - N, - CUDA_R_32F); -#else - // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - h_B, - ldb, - h_A, - lda, - &h_beta, - h_C, - N); - }); -#endif // CUDA_VERSION >= 8000 -} - template <> template <> inline void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -1376,77 +998,6 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 8000 } -template <> -template <> -inline void Blas::GEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C) const { -#if CUDA_VERSION >= 11000 - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE( - context_.GetComputeCapability(), - 80, - phi::errors::InvalidArgument( - "cublas fp16 gemm requires GPU compute capability >= 80," - "but received %d", - context_.GetComputeCapability())); - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = context_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); - context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - N, - CUDA_R_32F, - algo)); - }); -#else - // raise error - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); - -#endif // CUDA_VERSION >= 11000 -} - template <> template <> inline void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -1517,87 +1068,6 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 11000 } -template <> -template <> -inline void Blas::GEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - const phi::dtype::complex *B, - phi::dtype::complex beta, - phi::dtype::complex *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE( - context_.GetComputeCapability(), - 53, - phi::errors::InvalidArgument( - "cublas complex64 gemm requires GPU compute capability >= 53," - "but received %d", - context_.GetComputeCapability())); - - thrust::complex c_alpha = - thrust::complex(alpha.real, alpha.imag); - thrust::complex c_beta = thrust::complex(beta.real, beta.imag); - -#if CUDA_VERSION >= 8000 - // cublasHgemm does true FP16 computation which is slow for non-Volta - // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: - // input/output in fp16, computation in fp32, which can also be accelerated - // using tensor cores in volta GPUs. - auto &cuda_ctx = const_cast(context_); - CUBlas>::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &c_alpha, - B, - CUDA_C_32F, - ldb, - A, - CUDA_C_32F, - lda, - &c_beta, - C, - CUDA_C_32F, - N, - CUDA_C_32F); -#else - // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &c_alpha, - h_B, - ldb, - h_A, - lda, - &c_beta, - h_C, - N); - }); -#endif // CUDA_VERSION >= 8000 -} - template <> template <> inline void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -1680,17 +1150,16 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, template <> template <> -inline void Blas::GEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - const phi::dtype::complex *B, - phi::dtype::complex beta, - phi::dtype::complex *C) const { +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::complex alpha, + const phi::dtype::complex *A, + const phi::dtype::complex *B, + phi::dtype::complex beta, + phi::dtype::complex *C) const { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -1719,7 +1188,7 @@ inline void Blas::GEMM( // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: // input/output in fp16, computation in fp32, which can also be accelerated // using tensor cores in volta GPUs. - auto &cuda_ctx = const_cast(context_); + auto &cuda_ctx = const_cast(context_); CUBlas>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, @@ -1760,153 +1229,6 @@ inline void Blas::GEMM( #endif // CUDA_VERSION >= 8000 } -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - const phi::dtype::complex *B, - phi::dtype::complex beta, - phi::dtype::complex *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE( - context_.GetComputeCapability(), - 53, - phi::errors::InvalidArgument( - "cublas complex128 gemm requires GPU compute capability >= 53," - "but received %d", - context_.GetComputeCapability())); - - thrust::complex c_alpha = - thrust::complex(alpha.real, alpha.imag); - thrust::complex c_beta = - thrust::complex(beta.real, beta.imag); - -#if CUDA_VERSION >= 8000 - // cublasHgemm does true FP16 computation which is slow for non-Volta - // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: - // input/output in fp16, computation in fp32, which can also be accelerated - // using tensor cores in volta GPUs. - auto &cuda_ctx = const_cast(context_); - CUBlas>::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &c_alpha, - B, - CUDA_C_64F, - ldb, - A, - CUDA_C_64F, - lda, - &c_beta, - C, - CUDA_C_64F, - N, - CUDA_C_64F); -#else - // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &c_alpha, - h_B, - ldb, - h_A, - lda, - &c_beta, - h_C, - N); - }); -#endif // CUDA_VERSION >= 8000 -} - -template <> -template -void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - -#if CUDA_VERSION >= 8000 - if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - auto &cuda_ctx = - const_cast(context_); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - CUDA_R_32F, - ldb, - A, - CUDA_R_32F, - lda, - &beta, - C, - CUDA_R_32F, - ldc); - } else { -#endif // CUDA_VERSION >= 8000 - - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }); - -#if CUDA_VERSION >= 8000 - } -#endif // CUDA_VERSION >= 8000 -} - template <> template void Blas::GEMM(bool transA, @@ -1972,45 +1294,6 @@ void Blas::GEMM(bool transA, #endif // CUDA_VERSION >= 8000 } -template <> -template <> -inline void Blas::GEMM( - bool transA, - bool transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - int lda, - const phi::dtype::float16 *B, - int ldb, - phi::dtype::float16 beta, - phi::dtype::float16 *C, - int ldc) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t cuTransB = transB ? 
CUBLAS_OP_T : CUBLAS_OP_N; - - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }); -} - template <> template <> inline void Blas::GEMM(bool transA, @@ -2049,17 +1332,6 @@ inline void Blas::GEMM(bool transA, }); } -template <> -template -void Blas::AXPY(int n, - T alpha, - const T *x, - T *y) const { - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); - }); -} - template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { @@ -2068,15 +1340,6 @@ void Blas::AXPY(int n, T alpha, const T *x, T *y) const { }); } -template <> -template -void Blas::SCAL(int n, - const T alpha, - T *x) const { - context_.CublasCall( - [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); -} - template <> template void Blas::SCAL(int n, const T alpha, T *x) const { @@ -2086,247 +1349,67 @@ void Blas::SCAL(int n, const T alpha, T *x) const { template <> template -void Blas::VCOPY(int n, - const T *x, - T *y) const { - context_.CublasCall( - [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); -} - -template <> -template -void Blas::VCOPY(int n, const T *x, T *y) const { - context_.CublasCall( - [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); -} - -template <> -template -void Blas::GEMV(bool trans_a, - int M, - int N, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); - }); -} - -template <> -template -void Blas::GEMV(bool trans_a, - int M, - int N, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); - }); -} - -template <> -template <> -inline void Blas::GEMV( - bool trans_a, - int M, - int N, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - const phi::dtype::float16 *B, - phi::dtype::float16 beta, - phi::dtype::float16 *C) const { - // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. - if (trans_a) { - this->template GEMM( - CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); - } else { - this->template GEMM( - CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); - } -} - -template <> -template <> -inline void Blas::GEMV(bool trans_a, - int M, - int N, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - const phi::dtype::float16 *B, - phi::dtype::float16 beta, - phi::dtype::float16 *C) const { - // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. - if (trans_a) { - this->template GEMM( - CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); - } else { - this->template GEMM( - CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); - } -} - -template <> -template <> -inline void Blas::GEMV( - bool trans_a, - int M, - int N, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C) const { - // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve - // it. 
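// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original patch; the headers and the
// GetBlas helper are assumptions based on blas.h): once CUDADeviceContext is
// an alias of phi::GPUContext, a fluid-style call site keeps compiling
// unchanged and resolves to the Blas<phi::GPUContext> specializations kept in
// this header, which is why the duplicate Blas<CUDADeviceContext> bodies
// removed above are no longer needed.
#include "paddle/fluid/platform/device_context.h"  // assumed: defines the alias
#include "paddle/phi/kernels/funcs/blas/blas.h"    // assumed: GetBlas / Blas<T>

void GemmSketch(const paddle::platform::CUDADeviceContext& ctx,  // same type as phi::GPUContext
                int M, int N, int K,
                const float* A, const float* B, float* C) {
  // Equivalent to phi::funcs::GetBlas<phi::GPUContext, float>(ctx).
  auto blas = phi::funcs::GetBlas<paddle::platform::CUDADeviceContext, float>(ctx);
  // Dispatches to the Blas<phi::GPUContext>::GEMM specialization kept by this patch.
  blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, 1.0f, A, B, 0.0f, C);
}
// ---------------------------------------------------------------------------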
- if (trans_a) { - this->template GEMM( - CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); - } else { - this->template GEMM( - CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); - } -} - -template <> -template <> -inline void Blas::GEMV(bool trans_a, - int M, - int N, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C) const { - // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve - // it. - if (trans_a) { - this->template GEMM( - CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); - } else { - this->template GEMM( - CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); - } -} - -template <> -template -void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int batchCount, - int64_t strideA, - int64_t strideB) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - const int64_t strideC = M * N; - -#if CUDA_VERSION >= 9010 - if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || - std::is_same::value) { - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = context_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); - VLOG(4) << "use_half_precision_compute_type: " - << FLAGS_gemm_use_half_precision_compute_type; - - auto fp = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; - cudaDataType_t compute_type = fp; - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - void *a = static_cast(&h_alpha); - void *b = static_cast(&h_beta); - // set ComputeType as CUDA_R_32F for fp16, for better accuracy - if (FLAGS_gemm_use_half_precision_compute_type == true && - std::is_same::value) { - a = static_cast(&alpha); - b = static_cast(&beta); - compute_type = CUDA_R_16F; - } - - // set ComputeType as CUDA_R_32F for fp16 and fp32, for better accuracy - context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cublasGemmStridedBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - a, - B, - fp, - ldb, - strideB, - A, - fp, - lda, - strideA, - b, - C, - fp, - ldc, - strideC, - batchCount, - compute_type, - algo)); - }); - } else { -#endif // CUDA_VERSION >= 9010 +void Blas::VCOPY(int n, const T *x, T *y) const { + context_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); +} - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - strideB, - A, - lda, - strideA, - &beta, - C, - ldc, - strideC, - batchCount); - }); +template <> +template +void Blas::GEMV(bool trans_a, + int M, + int N, + T alpha, + const T *A, + const T *B, + T beta, + T *C) const { + cublasOperation_t cuTransA = !trans_a ? 
CUBLAS_OP_T : CUBLAS_OP_N; -#if CUDA_VERSION >= 9010 + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); +} + +template <> +template <> +inline void Blas::GEMV(bool trans_a, + int M, + int N, + phi::dtype::float16 alpha, + const phi::dtype::float16 *A, + const phi::dtype::float16 *B, + phi::dtype::float16 beta, + phi::dtype::float16 *C) const { + // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. + if (trans_a) { + this->template GEMM( + CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); + } else { + this->template GEMM( + CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); + } +} + +template <> +template <> +inline void Blas::GEMV(bool trans_a, + int M, + int N, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + const phi::dtype::bfloat16 *B, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C) const { + // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve + // it. + if (trans_a) { + this->template GEMM( + CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); + } else { + this->template GEMM( + CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); } -#endif // CUDA_VERSION >= 9010 } template <> @@ -2438,78 +1521,6 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 9010 } -template <> -template <> -inline void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C, - int batchCount, - int64_t strideA, - int64_t strideB) const { -#if CUDA_VERSION >= 11000 - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - const int64_t strideC = M * N; - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = context_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); - - context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - strideB, - A, - CUDA_R_16BF, - lda, - strideA, - &h_beta, - C, - CUDA_R_16BF, - ldc, - strideC, - batchCount, - CUBLAS_COMPUTE_32F, - algo)); - }); -#else - // raise error - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " - "11")); -#endif // CUDA_VERSION >= 11000 -} - template <> template <> inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -2582,26 +1593,6 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 11000 } -template <> -template -void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T **A, - const T **B, - T beta, - T **C, - int batchCount) const { - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } -} - template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -2621,26 +1612,6 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, } } -template <> -template <> -inline void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 **A, - const phi::dtype::float16 **B, - phi::dtype::float16 beta, - phi::dtype::float16 **C, - int batchCount) const { - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } -} - template <> template <> inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -2660,26 +1631,6 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, } } -template <> -template <> -inline void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 **A, - const phi::dtype::bfloat16 **B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 **C, - int batchCount) const { - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } -} - template <> template <> inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -2699,37 +1650,6 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, } } -template <> -template -void Blas::TRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T *A, - int lda, - T *B, - int ldb) const { - // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` - // where ' stands for transpose - cublasSideMode_t cuSide = - (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; - cublasFillMode_t cuUplo = - (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; - // use CUBLAS_OP_C (conjugate transpose) for complex - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasDiagType_t cuDiag = - (diag == CblasUnit) ? 
CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::TRSM( - handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb); - }); -} - template <> template void Blas::TRSM(CBLAS_SIDE side, @@ -2761,15 +1681,6 @@ void Blas::TRSM(CBLAS_SIDE side, }); } -template <> -template -void Blas::BatchedGETRF( - int n, T **a, int *ipiv, int *info, int batch_size) const { - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); - }); -} - template <> template void Blas::BatchedGETRF( @@ -2779,25 +1690,6 @@ void Blas::BatchedGETRF( }); } -template <> -template -void Blas::BatchedGETRI( - int n, const T **a, const int *ipiv, T **a_inv, int *info, int batch_size) - const { - PADDLE_ENFORCE_NE( - a_inv, - a, - phi::errors::InvalidArgument( - "cuBLAS fuction 'cublasgetrfBatched' cannot be executed " - "in-place. The memory space of output matrix (address: %p) cannot " - "overlap memory space of input matrix (address: %p).", - a_inv, - a)); - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); - }); -} - template <> template void Blas::BatchedGETRI(int n, @@ -2820,15 +1712,6 @@ void Blas::BatchedGETRI(int n, }); } -template <> -template -void Blas::BatchedMatInv( - int n, const T **a, T **a_inv, int *info, int batch_size) const { - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); - }); -} - template <> template void Blas::BatchedMatInv( @@ -2838,28 +1721,6 @@ void Blas::BatchedMatInv( }); } -template <> -template -void Blas::BatchedGETRS( - CBLAS_TRANSPOSE trans, - int n, - int nrhs, - const T **a, - int lda, - int *ipiv, - T **b, - int ldb, - int *info, - int batch_size) const { - // use CUBLAS_OP_C (conjugate transpose) for complex - cublasOperation_t cuTrans = - (trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GETRS_BATCH( - handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); - }); -} - template <> template void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, @@ -2881,50 +1742,6 @@ void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, }); } -template <> -template -void Blas::BatchedTRSM( - CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T **A, - int lda, - T **B, - int ldb, - int batch_size) const { - // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` - // where ' stands for transpose - cublasSideMode_t cuSide = - (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; - cublasFillMode_t cuUplo = - (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; - // use CUBLAS_OP_C (conjugate transpose) for complex - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasDiagType_t cuDiag = - (diag == CblasUnit) ? 
CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::TRSM_BATCH(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb, - batch_size); - }); -} - template <> template void Blas::BatchedTRSM(CBLAS_SIDE side, diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h index 1108d2fbca..e322fba39a 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h @@ -257,54 +257,6 @@ struct CUBlas { // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode template - static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - rocblas_datatype Atype, - int lda, - const void *B, - rocblas_datatype Btype, - int ldb, - const void *beta, - void *C, - rocblas_datatype Ctype, - int ldc, - rocblas_datatype computeType) { - rocblas_gemm_algo algo = rocblas_gemm_algo_standard; - dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - C, - Ctype, - ldc, - computeType, - algo, - 0, - 0)); - }); - } - template static void GEMM_EX(phi::GPUContext *dev_ctx, rocblas_operation transa, rocblas_operation transb, @@ -474,54 +426,6 @@ struct CUBlas> { // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode template - static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - rocblas_datatype Atype, - int lda, - const void *B, - rocblas_datatype Btype, - int ldb, - const void *beta, - void *C, - rocblas_datatype Ctype, - int ldc, - rocblas_datatype computeType) { - rocblas_gemm_algo algo = rocblas_gemm_algo_standard; - dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - C, - Ctype, - ldc, - computeType, - algo, - 0, - 0)); - }); - } - template static void GEMM_EX(phi::GPUContext *dev_ctx, rocblas_operation transa, rocblas_operation transb, @@ -692,54 +596,6 @@ struct CUBlas> { // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. 
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode template - static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - rocblas_datatype Atype, - int lda, - const void *B, - rocblas_datatype Btype, - int ldb, - const void *beta, - void *C, - rocblas_datatype Ctype, - int ldc, - rocblas_datatype computeType) { - rocblas_gemm_algo algo = rocblas_gemm_algo_standard; - dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - C, - Ctype, - ldc, - computeType, - algo, - 0, - 0)); - }); - } - template static void GEMM_EX(phi::GPUContext *dev_ctx, rocblas_operation transa, rocblas_operation transb, @@ -789,45 +645,6 @@ struct CUBlas> { } }; -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - rocblas_operation cuTransA = (transA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - rocblas_operation cuTransB = (transB == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - context_.CublasCall([&](rocblas_handle handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - N); - }); -} template <> template void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -868,62 +685,6 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, }); } -template <> -template <> -inline void Blas::GEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - const phi::dtype::float16 *B, - phi::dtype::float16 beta, - phi::dtype::float16 *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - rocblas_operation cuTransA = (transA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - rocblas_operation cuTransB = (transB == CblasNoTrans) - ? 
rocblas_operation_none - : rocblas_operation_transpose; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE( - context_.GetComputeCapability(), - 53, - phi::errors::InvalidArgument( - "cublas fp16 gemm requires GPU compute capability >= 53," - "but received %d", - context_.GetComputeCapability())); - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - - auto &cuda_ctx = const_cast(context_); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - rocblas_datatype_f16_r, - ldb, - A, - rocblas_datatype_f16_r, - lda, - &h_beta, - C, - rocblas_datatype_f16_r, - N, - rocblas_datatype_f32_r); -} template <> template <> inline void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -982,17 +743,16 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, template <> template <> -inline void Blas::GEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C) const { +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + const phi::dtype::bfloat16 *B, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C) const { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -1052,11 +812,11 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, int M, int N, int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C) const { + phi::dtype::complex alpha, + const phi::dtype::complex *A, + const phi::dtype::complex *B, + phi::dtype::complex beta, + phi::dtype::complex *C) const { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -1067,140 +827,19 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, rocblas_operation cuTransB = (transB == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose; - // TODO(zhiqiu): 80 has the same meaning for rocm and cuda? 
+ + // TODO(kexinzhao): add processing code for compute capability < 53 case PADDLE_ENFORCE_GE( context_.GetComputeCapability(), - 80, + 53, phi::errors::InvalidArgument( - "rocblas fp16 gemm requires GPU compute capability >= 80," + "cublas complex64 gemm requires GPU compute capability >= 53," "but received %d", context_.GetComputeCapability())); - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - rocblas_gemm_algo algo = rocblas_gemm_algo_standard; - - context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_gemm_ex(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - rocblas_datatype_bf16_r, - ldb, - A, - rocblas_datatype_bf16_r, - lda, - &h_beta, - C, - rocblas_datatype_bf16_r, - N, - C, - rocblas_datatype_bf16_r, - N, - rocblas_datatype_f32_r, - algo, - 0, - 0)); - }); -} - -template <> -template <> -inline void Blas::GEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - const phi::dtype::complex *B, - phi::dtype::complex beta, - phi::dtype::complex *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - rocblas_operation cuTransA = (transA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - rocblas_operation cuTransB = (transB == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE( - context_.GetComputeCapability(), - 53, - phi::errors::InvalidArgument( - "cublas complex64 gemm requires GPU compute capability >= 53," - "but received %d", - context_.GetComputeCapability())); - - thrust::complex c_alpha = - thrust::complex(alpha.real, alpha.imag); - thrust::complex c_beta = thrust::complex(beta.real, beta.imag); - - auto &cuda_ctx = const_cast(context_); - CUBlas>::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &c_alpha, - B, - rocblas_datatype_f32_c, - ldb, - A, - rocblas_datatype_f32_c, - lda, - &c_beta, - C, - rocblas_datatype_f32_c, - N, - rocblas_datatype_f32_c); -} -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - const phi::dtype::complex *B, - phi::dtype::complex beta, - phi::dtype::complex *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - rocblas_operation cuTransA = (transA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - rocblas_operation cuTransB = (transB == CblasNoTrans) - ? 
rocblas_operation_none - : rocblas_operation_transpose; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE( - context_.GetComputeCapability(), - 53, - phi::errors::InvalidArgument( - "cublas complex64 gemm requires GPU compute capability >= 53," - "but received %d", - context_.GetComputeCapability())); - - thrust::complex c_alpha = - thrust::complex(alpha.real, alpha.imag); - thrust::complex c_beta = thrust::complex(beta.real, beta.imag); + thrust::complex c_alpha = + thrust::complex(alpha.real, alpha.imag); + thrust::complex c_beta = thrust::complex(beta.real, beta.imag); auto &cuda_ctx = const_cast(context_); CUBlas>::GEMM_EX(&cuda_ctx, @@ -1223,64 +862,6 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, rocblas_datatype_f32_c); } -template <> -template <> -inline void Blas::GEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - const phi::dtype::complex *B, - phi::dtype::complex beta, - phi::dtype::complex *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - rocblas_operation cuTransA = (transA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - rocblas_operation cuTransB = (transB == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE( - context_.GetComputeCapability(), - 53, - phi::errors::InvalidArgument( - "cublas complex128 gemm requires GPU compute capability >= 53," - "but received %d", - context_.GetComputeCapability())); - - thrust::complex c_alpha = - thrust::complex(alpha.real, alpha.imag); - thrust::complex c_beta = - thrust::complex(beta.real, beta.imag); - - auto &cuda_ctx = const_cast(context_); - CUBlas>::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &c_alpha, - B, - rocblas_datatype_f64_c, - ldb, - A, - rocblas_datatype_f64_c, - lda, - &c_beta, - C, - rocblas_datatype_f64_c, - N, - rocblas_datatype_f64_c); -} template <> template <> inline void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -1339,44 +920,6 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, rocblas_datatype_f64_c); } -template <> -template -void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - rocblas_operation cuTransA = - transA ? rocblas_operation_transpose : rocblas_operation_none; - rocblas_operation cuTransB = - transB ? rocblas_operation_transpose : rocblas_operation_none; - context_.CublasCall([&](rocblas_handle handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }); -} template <> template void Blas::GEMM(bool transA, @@ -1416,46 +959,6 @@ void Blas::GEMM(bool transA, }); } -template <> -template <> -inline void Blas::GEMM( - bool transA, - bool transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - int lda, - const phi::dtype::float16 *B, - int ldb, - phi::dtype::float16 beta, - phi::dtype::float16 *C, - int ldc) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. 
- rocblas_operation cuTransA = - transA ? rocblas_operation_transpose : rocblas_operation_none; - rocblas_operation cuTransB = - transB ? rocblas_operation_transpose : rocblas_operation_none; - - context_.CublasCall([&](rocblas_handle handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }); -} template <> template <> inline void Blas::GEMM(bool transA, @@ -1496,16 +999,6 @@ inline void Blas::GEMM(bool transA, }); } -template <> -template -void Blas::AXPY(int n, - T alpha, - const T *x, - T *y) const { - context_.CublasCall([&](rocblas_handle handle) { - CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); - }); -} template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { @@ -1514,14 +1007,6 @@ void Blas::AXPY(int n, T alpha, const T *x, T *y) const { }); } -template <> -template -void Blas::SCAL(int n, - const T alpha, - T *x) const { - context_.CublasCall( - [&](rocblas_handle handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); -} template <> template void Blas::SCAL(int n, const T alpha, T *x) const { @@ -1529,14 +1014,6 @@ void Blas::SCAL(int n, const T alpha, T *x) const { [&](rocblas_handle handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); } -template <> -template -void Blas::VCOPY(int n, - const T *x, - T *y) const { - context_.CublasCall( - [&](rocblas_handle handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); -} template <> template void Blas::VCOPY(int n, const T *x, T *y) const { @@ -1544,23 +1021,6 @@ void Blas::VCOPY(int n, const T *x, T *y) const { [&](rocblas_handle handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); } -template <> -template -void Blas::GEMV(bool trans_a, - int M, - int N, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - rocblas_operation cuTransA = - !trans_a ? rocblas_operation_transpose : rocblas_operation_none; - - context_.CublasCall([&](rocblas_handle handle) { - CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); - }); -} template <> template void Blas::GEMV(bool trans_a, @@ -1579,26 +1039,6 @@ void Blas::GEMV(bool trans_a, }); } -template <> -template <> -inline void Blas::GEMV( - bool trans_a, - int M, - int N, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - const phi::dtype::float16 *B, - phi::dtype::float16 beta, - phi::dtype::float16 *C) const { - // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. - if (trans_a) { - this->template GEMM( - CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); - } else { - this->template GEMM( - CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); - } -} template <> template <> inline void Blas::GEMV(bool trans_a, @@ -1619,26 +1059,6 @@ inline void Blas::GEMV(bool trans_a, } } -template <> -template <> -inline void Blas::GEMV( - bool trans_a, - int M, - int N, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C) const { - // Because rocblas doesn't support bfloat16 gemv, we use gemmex to achieve it. 
- if (trans_a) { - this->template GEMM( - CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); - } else { - this->template GEMM( - CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); - } -} template <> template <> inline void Blas::GEMV(bool trans_a, @@ -1659,56 +1079,6 @@ inline void Blas::GEMV(bool trans_a, } } -template <> -template -void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int batchCount, - int64_t strideA, - int64_t strideB) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - rocblas_operation cuTransA = (transA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - rocblas_operation cuTransB = (transB == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - const int64_t strideC = M * N; - context_.CublasCall([&](rocblas_handle handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - strideB, - A, - lda, - strideA, - &beta, - C, - ldc, - strideC, - batchCount); - }); -} - template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -1758,71 +1128,6 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, }); } -template <> -template <> -inline void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C, - int batchCount, - int64_t strideA, - int64_t strideB) const { - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - const int64_t strideC = M * N; - rocblas_operation cuTransA = (transA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - rocblas_operation cuTransB = (transB == CblasNoTrans) - ? 
rocblas_operation_none - : rocblas_operation_transpose; - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - rocblas_gemm_algo algo = rocblas_gemm_algo_standard; - - context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_gemm_strided_batched_ex( - handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - rocblas_datatype_bf16_r, - ldb, - strideB, - A, - rocblas_datatype_bf16_r, - lda, - strideA, - &h_beta, - C, - rocblas_datatype_bf16_r, - ldc, - strideC, - C, - rocblas_datatype_bf16_r, - ldc, - strideC, - batchCount, - rocblas_datatype_f32_r, - algo, - 0, - 0)); - }); -} - template <> template <> inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -1887,26 +1192,6 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, }); } -template <> -template -void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T **A, - const T **B, - T beta, - T **C, - int batchCount) const { - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } -} - template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -1926,25 +1211,6 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, } } -template <> -template <> -inline void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 **A, - const phi::dtype::float16 **B, - phi::dtype::float16 beta, - phi::dtype::float16 **C, - int batchCount) const { - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } -} template <> template <> inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -1964,26 +1230,6 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, } } -template <> -template <> -inline void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 **A, - const phi::dtype::bfloat16 **B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 **C, - int batchCount) const { - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } -} - template <> template <> inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -2003,37 +1249,6 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, } } -template <> -template -void Blas::TRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T *A, - int lda, - T *B, - int ldb) const { - // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` - // where ' stands for transpose - rocblas_side cuSide = - (side == CblasLeft) ? rocblas_side_right : rocblas_side_left; - rocblas_fill cuUplo = - (uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower; - // use CUBLAS_OP_C (conjugate transpose) for complex - rocblas_operation cuTransA = (transA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - rocblas_diagonal cuDiag = - (diag == CblasUnit) ? 
rocblas_diagonal_unit : rocblas_diagonal_non_unit; - - context_.CublasCall([&](rocblas_handle handle) { - CUBlas::TRSM( - handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb); - }); -} template <> template void Blas::TRSM(CBLAS_SIDE side, @@ -2066,14 +1281,6 @@ void Blas::TRSM(CBLAS_SIDE side, }); } -template <> -template -void Blas::BatchedGETRF( - int n, T **a, int *ipiv, int *info, int batch_size) const { - context_.CublasCall([&](rocblas_handle handle) { - CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); - }); -} template <> template void Blas::BatchedGETRF( @@ -2083,24 +1290,6 @@ void Blas::BatchedGETRF( }); } -template <> -template -void Blas::BatchedGETRI( - int n, const T **a, const int *ipiv, T **a_inv, int *info, int batch_size) - const { - PADDLE_ENFORCE_NE( - a_inv, - a, - phi::errors::InvalidArgument( - "cuBLAS fuction 'cublasgetrfBatched' cannot be executed " - "in-place. The memory space of output matrix (address: %p) cannot " - "overlap memory space of input matrix (address: %p).", - a_inv, - a)); - context_.CublasCall([&](rocblas_handle handle) { - CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); - }); -} template <> template void Blas::BatchedGETRI(int n, @@ -2123,14 +1312,6 @@ void Blas::BatchedGETRI(int n, }); } -template <> -template -void Blas::BatchedMatInv( - int n, const T **a, T **a_inv, int *info, int batch_size) const { - context_.CublasCall([&](rocblas_handle handle) { - CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); - }); -} template <> template void Blas::BatchedMatInv( @@ -2140,27 +1321,6 @@ void Blas::BatchedMatInv( }); } -template <> -template -void Blas::BatchedGETRS( - CBLAS_TRANSPOSE trans, - int n, - int nrhs, - const T **a, - int lda, - int *ipiv, - T **b, - int ldb, - int *info, - int batch_size) const { - rocblas_operation cuTrans = (trans == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - context_.CublasCall([&](rocblas_handle handle) { - CUBlas::GETRS_BATCH( - handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); - }); -} template <> template void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, @@ -2182,50 +1342,6 @@ void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, }); } -template <> -template -void Blas::BatchedTRSM( - CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T **A, - int lda, - T **B, - int ldb, - int batch_size) const { - // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` - // where ' stands for transpose - rocblas_side cuSide = - (side == CblasLeft) ? rocblas_side_right : rocblas_side_left; - rocblas_fill cuUplo = - (uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower; - // use CUBLAS_OP_C (conjugate transpose) for complex - rocblas_operation cuTransA = (transA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - rocblas_diagonal cuDiag = - (diag == CblasUnit) ? 
rocblas_diagonal_unit : rocblas_diagonal_non_unit; - - context_.CublasCall([&](rocblas_handle handle) { - CUBlas::TRSM_BATCH(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb, - batch_size); - }); -} template <> template void Blas::BatchedTRSM(CBLAS_SIDE side, diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index 901551f964..6015266dde 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -313,10 +313,6 @@ void FCFunctor::operator()(const DeviceContext& context, AddReluKernel(context.stream(), M, N, Y, B, relu); } -template class FCFunctor; -template class FCFunctor; -template class FCFunctor; - template class FCFunctor; template class FCFunctor; template class FCFunctor; diff --git a/paddle/phi/kernels/funcs/for_range.h b/paddle/phi/kernels/funcs/for_range.h index 78066ce5b2..4625414e7a 100644 --- a/paddle/phi/kernels/funcs/for_range.h +++ b/paddle/phi/kernels/funcs/for_range.h @@ -91,22 +91,6 @@ struct ForRange { size_t limit_; }; -// NOTE: After the pten kernel is migrated, it needs to be deleted. -template <> -struct ForRange { - ForRange(const paddle::platform::CUDADeviceContext& dev_ctx, size_t limit) - : dev_ctx_(dev_ctx), limit_(limit) {} - - template - inline void operator()(Function func) const { - phi::funcs::ForRange for_range(dev_ctx_, limit_); - for_range(func); - } - - const paddle::platform::CUDADeviceContext& dev_ctx_; - size_t limit_; -}; - #endif } // namespace funcs diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu index 42ba0ba711..bbd160e35c 100644 --- a/paddle/phi/kernels/funcs/math_function.cu +++ b/paddle/phi/kernels/funcs/math_function.cu @@ -31,22 +31,6 @@ namespace funcs { using float16 = phi::dtype::float16; using bfloat16 = phi::dtype::bfloat16; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; - template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -75,44 +59,18 @@ template struct SetConstant>; -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose, \ - RANK>; \ - template struct Transpose, \ - RANK>; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose, \ - RANK>; \ +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, \ + RANK>; \ template struct Transpose, RANK>; DEFINE_GPU_TRANS(1); @@ -240,8 +198,7 @@ struct TransposeNormal { }; // define transpose normal -#define DEFINE_GPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal; \ +#define 
DEFINE_GPU_TRANS_NORMAL(TYPE) \
   template struct TransposeNormal<phi::GPUContext, TYPE>
 
 DEFINE_GPU_TRANS_NORMAL(float16);
diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc
index eef355e688..eb9434396c 100644
--- a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc
+++ b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc
@@ -131,10 +131,5 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
 template class MatrixInverseFunctor<phi::GPUContext, float>;
 template class MatrixInverseFunctor<phi::GPUContext, double>;
 
-// TODO(chenweihang): remove these instantiations later
-template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext, float>;
-template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext,
-                                    double>;
-
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/matrix_solve.cu b/paddle/phi/kernels/funcs/matrix_solve.cu
index fccceb7e20..004375bc24 100644
--- a/paddle/phi/kernels/funcs/matrix_solve.cu
+++ b/paddle/phi/kernels/funcs/matrix_solve.cu
@@ -170,9 +170,5 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
 template class MatrixSolveFunctor<phi::GPUContext, float>;
 template class MatrixSolveFunctor<phi::GPUContext, double>;
 
-// TODO(wuweilong): remove these instantiations later
-template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, float>;
-template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, double>;
-
 }  // namespace funcs
 }  // namespace phi
-- 
GitLab
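
Below is a minimal, hypothetical usage sketch (C++, not part of the patch above) of the call path that remains once the CUDADeviceContext specializations are deleted: kernels obtain the BLAS wrapper through phi::funcs::GetBlas with phi::GPUContext, so the retained Blas<phi::GPUContext> specializations shown in this diff are the only GPU path exercised. The function example::MatMulRowMajor and its parameters are invented for illustration; GetBlas, Blas::GEMM, and phi::GPUContext are taken from the code kept by this patch.

// Hypothetical sketch, assuming a valid phi::GPUContext and device pointers.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"

namespace example {  // hypothetical namespace, not in the patch

template <typename T>
void MatMulRowMajor(const phi::GPUContext& dev_ctx,
                    const T* A,  // M x K, row major, device memory
                    const T* B,  // K x N, row major, device memory
                    T* C,        // M x N, row major, device memory
                    int M, int N, int K) {
  // GetBlas<phi::GPUContext, T> wraps the GPUContext specializations that
  // this patch keeps; the old CUDADeviceContext overloads no longer exist.
  auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
  // C = 1.0 * A * B + 0.0 * C
  blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K,
            static_cast<T>(1.0), A, B, static_cast<T>(0.0), C);
}

}  // namespace example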