Unverified commit 88490567, authored by Leo Chen, committed by GitHub

unify fluid::CUDADeviceContext and phi::GPUContext (#44723)

* remove CUDADeviceContext

* remove more template specializations

* fix ROCm compile
Parent 0a2db7c8
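A short orientation note (not part of the commit message): the pivotal hunk further down, in the device-context header, deletes the fluid-side class and leaves only a type alias. The sketch below is a simplified before/after, with a forward declaration standing in for the real phi header; treat it as an illustration, not the literal file contents.

// Sketch only; condensed from the device-context hunk later in this diff.
// Before this PR, fluid declared a thin subclass:
//
//   class CUDADeviceContext : public phi::GPUContext {
//    public:
//     explicit CUDADeviceContext(CUDAPlace place);
//     virtual ~CUDADeviceContext();
//   };
//
// After this PR the fluid name is only an alias, so
// Blas<platform::CUDADeviceContext> and Blas<phi::GPUContext> name the same
// type and a single set of explicit template instantiations is enough; that
// is why most of this diff is deletions of duplicated specializations.

namespace phi {
class GPUContext;  // real definition comes from phi's gpu_context header
}  // namespace phi

namespace paddle {
namespace platform {
using CUDADeviceContext = phi::GPUContext;
}  // namespace platform
}  // namespace paddle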
......@@ -23,12 +23,6 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
class GarbageCollector;
......
......@@ -25,11 +25,6 @@
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
namespace memory {
namespace allocation {
......
......@@ -23,13 +23,6 @@ limitations under the License. */
#include "paddle/fluid/operators/miopen_lstm_cache.h"
#endif
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
......
......@@ -182,7 +182,7 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
#endif
size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
// first sum pool
FusedSeqpoolKernelNormal<<<config.block_per_grid.x,
config.thread_per_block.x,
......@@ -209,7 +209,8 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
// not need show click input
N = static_cast<size_t>(batch_size * slot_num *
(embedding_size - cvm_offset));
platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, N);
FusedCVMKernelNoCVM<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
......@@ -391,7 +392,7 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx,
#endif
size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
auto config = GetGpuLaunchConfig1D(dev_ctx, N);
auto config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
if (use_cvm) {
// join grad
FusedSeqpoolCVMGradKernelWithCVM<<<config.block_per_grid.x,
......
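A note on the GetGpuLaunchConfig1D hunks above (my reading, not stated in the patch): once dev_ctx is a phi::GPUContext rather than a class in paddle::platform, argument-dependent lookup no longer searches paddle::platform, so the helper must be called with an explicit platform:: qualifier. A self-contained toy example of the lookup difference, where the names mirror Paddle's but the types are empty stand-ins:

#include <cstddef>

namespace phi {
struct GPUContext {};                 // stand-in for phi::GPUContext
}  // namespace phi

namespace platform {
struct CUDADeviceContext {};          // stand-in for the old fluid type
int GetGpuLaunchConfig1D(const CUDADeviceContext&, std::size_t) { return 0; }
int GetGpuLaunchConfig1D(const phi::GPUContext&, std::size_t) { return 1; }
}  // namespace platform

int main() {
  platform::CUDADeviceContext old_ctx;
  phi::GPUContext new_ctx;
  GetGpuLaunchConfig1D(old_ctx, 8);            // found via ADL in platform::
  // GetGpuLaunchConfig1D(new_ctx, 8);         // error: ADL only searches phi::
  platform::GetGpuLaunchConfig1D(new_ctx, 8);  // explicit qualification, as in
                                               // the hunks above
  return 0;
}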
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/gru_op.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
......
......@@ -150,11 +150,6 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
}
}
template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
template class CrossEntropyFunctor<platform::CUDADeviceContext,
platform::float16>;
template class CrossEntropyFunctor<phi::GPUContext, float>;
template class CrossEntropyFunctor<phi::GPUContext, double>;
template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;
......
......@@ -308,24 +308,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
double>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
phi::GPUContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
phi::GPUContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
phi::GPUContext,
float>;
......@@ -576,12 +564,6 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
double>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
phi::GPUContext,
float>;
......@@ -589,12 +571,6 @@ template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
phi::GPUContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
phi::GPUContext,
float>;
......
......@@ -173,12 +173,6 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
axis);
}
template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
template class MaxOutFunctor<platform::CUDADeviceContext, float>;
template class MaxOutFunctor<platform::CUDADeviceContext, double>;
template class MaxOutGradFunctor<phi::GPUContext, float>;
template class MaxOutGradFunctor<phi::GPUContext, double>;
......
......@@ -22,12 +22,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/sampler.h"
#include "paddle/phi/core/ddim.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
......
......@@ -133,77 +133,6 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
}
} // namespace
template <typename T>
struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input1,
const framework::Tensor& input2,
framework::Tensor* output) {
auto in1_height = input1.height();
auto in2_dims = input2.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(
in1_height,
in2_dims[0],
platform::errors::InvalidArgument(
"The two inputs height must be equal."
"But received first input height = [%d], first input height = [%d]",
in1_height,
in2_dims[0]));
PADDLE_ENFORCE_EQ(
in1_height,
out_dims[0],
platform::errors::InvalidArgument(
"The input and output height must be equal."
"But received input height = [%d], output height = [%d]",
in1_height,
out_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(
in1_row_numel,
input2.numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But received first input width = [%d], second input width = [%d]",
in1_row_numel,
input2.numel() / in1_height));
PADDLE_ENFORCE_EQ(
in1_row_numel,
output->numel() / in1_height,
platform::errors::InvalidArgument(
"The input and output width must be equal."
"But received input width = [%d], output width = [%d]",
in1_row_numel,
output->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* in2_data = input2.data<T>();
auto* out_data = output->data<T>();
phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
functor(context, output, static_cast<T>(0));
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(in1_rows.size(), 1);
paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
SelectedRowsAddTensorKernel<T, block_size>
<<<grid, threads, 0, context.stream()>>>(
in1_data,
mixv_in1_rows.CUDAData(context.GetPlace()),
out_data,
in1_row_numel);
auto out_eigen = framework::EigenVector<T>::Flatten(*output);
auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
}
};
template <typename T>
struct SelectedRowsAddTensor<phi::GPUContext, T> {
void operator()(const phi::GPUContext& context,
......@@ -275,12 +204,6 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
}
};
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
platform::float16>;
template struct SelectedRowsAddTensor<phi::GPUContext, float>;
template struct SelectedRowsAddTensor<phi::GPUContext, double>;
template struct SelectedRowsAdd<phi::GPUContext, platform::float16>;
......@@ -363,50 +286,6 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
}
} // namespace
template <typename T>
struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input1,
framework::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(
in1_height,
in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But received first input height = "
"[%d], second input height = [%d]",
in1_height,
in2_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(
in1_row_numel,
input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But received first input width = [%d], second input width = [%d]",
in1_row_numel,
input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* in2_data = input2->data<T>();
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(in1_rows.size(), 1);
paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
SelectedRowsAddToTensorKernel<T, block_size>
<<<grid, threads, 0, context.stream()>>>(
in1_data,
mixv_in1_rows.CUDAData(context.GetPlace()),
in2_data,
in1_row_numel);
}
};
template <typename T>
struct SelectedRowsAddToTensor<phi::GPUContext, T> {
void operator()(const phi::GPUContext& context,
......@@ -451,12 +330,6 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
}
};
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
platform::float16>;
template struct SelectedRowsAddToTensor<phi::GPUContext, float>;
template struct SelectedRowsAddToTensor<phi::GPUContext, double>;
template struct SelectedRowsAddToTensor<phi::GPUContext, int>;
......@@ -625,34 +498,6 @@ struct MergeAddImpl {
}
};
template <typename T>
struct MergeAdd<platform::CUDADeviceContext, T> {
// unary functor, merge by adding duplicated rows in
// the input SelectedRows object.
phi::SelectedRows operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input,
const bool sorted_result) {
return MergeAddImpl<platform::CUDADeviceContext, T>()(
context, input, sorted_result);
}
void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input,
phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CUDADeviceContext, T>()(
context, input, output, sorted_result);
}
void operator()(const platform::CUDADeviceContext& context,
const std::vector<const phi::SelectedRows*>& inputs,
phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CUDADeviceContext, T>()(
context, inputs, output, sorted_result);
}
};
template <typename T>
struct MergeAdd<phi::GPUContext, T> {
// unary functor, merge by adding duplicated rows in
......@@ -678,10 +523,8 @@ struct MergeAdd<phi::GPUContext, T> {
}
};
#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \
template struct MergeAddImpl<platform::CUDADeviceContext, dtype>; \
template struct MergeAddImpl<phi::GPUContext, dtype>; \
template struct MergeAdd<platform::CUDADeviceContext, dtype>; \
#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \
template struct MergeAddImpl<phi::GPUContext, dtype>; \
template struct MergeAdd<phi::GPUContext, dtype>;
TEMPLATE_SPECIALIZED_FOR_MERGEADD(float)
......
......@@ -57,88 +57,6 @@ __global__ void SequencePaddingKernel(T* dst,
}
}
template <typename T>
class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::LoDTensor& seq_tensor,
framework::LoDTensor* pad_tensor,
const framework::LoDTensor& pad_value,
int pad_seq_len = -1,
int lod_level = 0,
bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_lod = seq_tensor.lod();
auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
const auto& seq_tensor_dims = seq_tensor.dims();
const auto& pad_tensor_dims = pad_tensor->dims();
int max_seq_len = MaximumSequenceLength(seq_offsets);
if (pad_seq_len == -1) {
pad_seq_len = max_seq_len;
}
PADDLE_ENFORCE_GE(
pad_seq_len,
max_seq_len,
platform::errors::InvalidArgument(
"The pad_seq_len must be equal to or greater than the "
"original max sequence length. Expected %ld >= %ld, but got %ld < "
"%ld. Please check the input value.",
pad_seq_len,
max_seq_len,
pad_seq_len,
max_seq_len));
int step_width = seq_tensor.numel() / seq_tensor_dims[0];
int seq_num = seq_offsets.size() - 1;
CheckDims(seq_tensor_dims,
pad_tensor_dims,
seq_offsets,
pad_seq_len,
step_width,
layout);
PADDLE_ENFORCE_EQ(
pad_value.numel() == 1 || pad_value.numel() == step_width,
true,
platform::errors::InvalidArgument(
"The numel of 'pad_value' can only be 1 or be equal to "
"the 'step_width', but got %ld != 1 and %ld. Please check the "
"input value.",
pad_value.numel(),
step_width));
const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
const T* seq_data = seq_tensor.data<T>();
T* pad_data = pad_tensor->data<T>();
const T* pad_value_data = pad_value.data<T>();
paddle::framework::MixVector<size_t> mix_vector_seq_offsets(&seq_offsets);
SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
pad_data,
seq_data,
pad_value_data,
pad_value.numel() == 1,
mix_vector_seq_offsets.CUDAData(context.GetPlace()),
seq_num,
pad_seq_len,
step_width,
norm_by_times,
layout);
}
};
template <typename T>
class PaddingLoDTensorFunctor<phi::GPUContext, T> {
public:
......@@ -221,73 +139,6 @@ class PaddingLoDTensorFunctor<phi::GPUContext, T> {
}
};
template <typename T>
class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::LoDTensor& pad_tensor,
framework::LoDTensor* seq_tensor,
int pad_seq_len = -1,
int lod_level = 0,
bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
const auto& seq_tensor_dims = seq_tensor->dims();
const auto& pad_tensor_dims = pad_tensor.dims();
int max_seq_len = MaximumSequenceLength(seq_offsets);
if (pad_seq_len == -1) {
pad_seq_len = max_seq_len;
}
int step_width = seq_tensor->numel() / seq_tensor_dims[0];
int seq_num = seq_offsets.size() - 1;
CheckDims(seq_tensor_dims,
pad_tensor_dims,
seq_offsets,
pad_seq_len,
step_width,
layout);
/*
if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
paddle::framework::TensorCopy(pad_tensor, context.GetPlace(), context,
seq_tensor);
seq_tensor->Resize(seq_tensor_dims);
return;
}
*/
const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
const T* pad_data = pad_tensor.data<T>();
T* seq_data = seq_tensor->data<T>();
paddle::framework::MixVector<size_t> mixv_seq_offsets(&seq_offsets);
SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
seq_data,
pad_data,
nullptr,
false,
mixv_seq_offsets.CUDAData(context.GetPlace()),
seq_num,
pad_seq_len,
step_width,
norm_by_times,
layout);
}
};
template <typename T>
class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
public:
......@@ -355,16 +206,6 @@ class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
}
};
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
template class PaddingLoDTensorFunctor<phi::GPUContext, int>;
template class PaddingLoDTensorFunctor<phi::GPUContext, int64_t>;
template class PaddingLoDTensorFunctor<phi::GPUContext, float>;
......
......@@ -35,43 +35,6 @@ __global__ void SequenceScaleKernel(T* seq,
}
}
template <typename T>
class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const T* scales,
framework::LoDTensor* seq) {
const size_t level = 0;
auto lod = seq->lod();
const size_t num_seq = lod[level].size() - 1;
const size_t seq_width = seq->numel() / seq->dims()[0];
auto abs_offset_lod = framework::ToAbsOffset(lod);
T* seq_data = seq->mutable_data<T>(context.GetPlace());
paddle::framework::MixVector<size_t> mix_vector(&(abs_offset_lod[level]));
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(
HIP_KERNEL_NAME(SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>),
dim3(num_seq),
dim3(PADDLE_CUDA_NUM_THREADS),
0,
context.stream(),
seq_data,
mix_vector.CUDAMutableData(context.GetPlace()),
scales,
seq_width);
#else
SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>
<<<num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
seq_data,
mix_vector.CUDAMutableData(context.GetPlace()),
scales,
seq_width);
#endif
mix_vector.CopyToCPU();
}
};
template <typename T>
class ScaleLoDTensorFunctor<phi::GPUContext, T> {
public:
......@@ -109,9 +72,6 @@ class ScaleLoDTensorFunctor<phi::GPUContext, T> {
}
};
template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, double>;
template class ScaleLoDTensorFunctor<phi::GPUContext, float>;
template class ScaleLoDTensorFunctor<phi::GPUContext, double>;
......
......@@ -141,56 +141,21 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
#endif
}
template class SoftmaxCUDNNFunctor<float, platform::CUDADeviceContext>;
template class SoftmaxCUDNNFunctor<platform::float16,
platform::CUDADeviceContext>;
template class SoftmaxGradCUDNNFunctor<float, platform::CUDADeviceContext>;
template class SoftmaxGradCUDNNFunctor<platform::float16,
platform::CUDADeviceContext>;
template class SoftmaxCUDNNFunctor<float, phi::GPUContext>;
template class SoftmaxCUDNNFunctor<platform::float16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<float, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<platform::float16, phi::GPUContext>;
#if CUDNN_VERSION_MIN(8, 1, 0)
template class SoftmaxCUDNNFunctor<platform::bfloat16,
platform::CUDADeviceContext>;
template class SoftmaxGradCUDNNFunctor<platform::bfloat16,
platform::CUDADeviceContext>;
template class SoftmaxCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
#endif
// MIOPEN do not support double
#ifndef PADDLE_WITH_HIP
template class SoftmaxCUDNNFunctor<double, platform::CUDADeviceContext>;
template class SoftmaxGradCUDNNFunctor<double, platform::CUDADeviceContext>;
template class SoftmaxCUDNNFunctor<double, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<double, phi::GPUContext>;
#endif
template class SoftmaxFunctor<platform::CUDADeviceContext,
platform::float16,
false>;
template class SoftmaxFunctor<platform::CUDADeviceContext,
platform::float16,
true>;
template class SoftmaxFunctor<platform::CUDADeviceContext,
platform::bfloat16,
false>;
template class SoftmaxFunctor<platform::CUDADeviceContext,
platform::bfloat16,
true>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float, false>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double, false>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float, true>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double, true>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext,
platform::float16>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext,
platform::bfloat16>;
template class SoftmaxFunctor<phi::GPUContext, platform::float16, false>;
template class SoftmaxFunctor<phi::GPUContext, platform::float16, true>;
template class SoftmaxFunctor<phi::GPUContext, platform::bfloat16, false>;
......
......@@ -417,13 +417,9 @@ void Col2VolFunctor<DeviceContext, T>::operator()(
}
// };
template class Vol2ColFunctor<platform::CUDADeviceContext, float>;
template class Vol2ColFunctor<platform::CUDADeviceContext, double>;
template class Vol2ColFunctor<phi::GPUContext, float>;
template class Vol2ColFunctor<phi::GPUContext, double>;
template class Col2VolFunctor<platform::CUDADeviceContext, float>;
template class Col2VolFunctor<platform::CUDADeviceContext, double>;
template class Col2VolFunctor<phi::GPUContext, float>;
template class Col2VolFunctor<phi::GPUContext, double>;
......
......@@ -16,12 +16,6 @@
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
sequence_concat,
paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
......
......@@ -51,7 +51,6 @@ namespace platform {
//
// The NCCLComm instance is created and reversed in the NCCLCommContext
// singleton with a global user specified group id.
class CUDADeviceContext;
class NCCLComm {
public:
......
......@@ -533,11 +533,6 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
}
CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
: phi::GPUContext(place) {}
CUDADeviceContext::~CUDADeviceContext() = default;
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
}
......
......@@ -271,15 +271,7 @@ struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
class CudnnWorkspaceHandle;
class EigenCudaStreamDevice;
class CUDADeviceContext : public phi::GPUContext {
public:
explicit CUDADeviceContext(CUDAPlace place);
virtual ~CUDADeviceContext();
private:
int place_holder_; // TO BE REMOVED
DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
};
using CUDADeviceContext = phi::GPUContext;
class CudnnWorkspaceHandle {
public:
......
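For context, a hypothetical caller (not code from this patch): because platform::CUDADeviceContext is now just an alias, existing fluid code that takes the old type keeps compiling and simply sees the phi::GPUContext API. The include path below is the fluid header that carries the alias per the hunk above; treat it as an assumption.

#include "paddle/fluid/platform/device_context.h"

// Hypothetical sketch: old-style fluid signature, new phi behavior.
void EnqueueOnFluidContext(const paddle::platform::CUDADeviceContext& ctx) {
  auto stream = ctx.stream();  // resolves to phi::GPUContext::stream()
  (void)stream;                // ... launch kernels on `stream` as before
}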
......@@ -96,66 +96,6 @@ struct Transform<phi::CPUContext> {
};
#if defined(__NVCC__) || defined(__HIPCC__)
template <>
struct Transform<platform::CUDADeviceContext> {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const platform::CUDADeviceContext& context,
InputIter first,
InputIter last,
OutputIter result,
UnaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE_EQ(is_gpu_place(place),
true,
platform::errors::PreconditionNotMet(
"The CUDA Transform must be used in GPU place."));
#ifdef __HIPCC__
thrust::transform(thrust::hip::par.on(context.stream()),
details::CastToCUDATransformIterator(first),
details::CastToCUDATransformIterator(last),
details::CastToCUDATransformIterator(result),
op);
#else
thrust::transform(thrust::cuda::par.on(context.stream()),
details::CastToCUDATransformIterator(first),
details::CastToCUDATransformIterator(last),
details::CastToCUDATransformIterator(result),
op);
#endif
}
template <typename InputIter1,
typename InputIter2,
typename OutputIter,
typename BinaryOperation>
void operator()(const platform::CUDADeviceContext& context,
InputIter1 first1,
InputIter1 last1,
InputIter2 first2,
OutputIter result,
BinaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE_EQ(is_gpu_place(place),
true,
platform::errors::PreconditionNotMet(
"The CUDA Transform must be used in GPU place."));
#ifdef __HIPCC__
thrust::transform(thrust::hip::par.on(context.stream()),
details::CastToCUDATransformIterator(first1),
details::CastToCUDATransformIterator(last1),
details::CastToCUDATransformIterator(first2),
details::CastToCUDATransformIterator(result),
op);
#else
thrust::transform(thrust::cuda::par.on(context.stream()),
details::CastToCUDATransformIterator(first1),
details::CastToCUDATransformIterator(last1),
details::CastToCUDATransformIterator(first2),
details::CastToCUDATransformIterator(result),
op);
#endif
}
};
template <>
struct Transform<phi::GPUContext> {
......
......@@ -66,58 +66,6 @@ struct CUBlas<float> {
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float *alpha,
const void *A,
cudaDataType_t Atype,
int lda,
const void *B,
cudaDataType_t Btype,
int ldb,
const float *beta,
void *C,
cudaDataType_t Ctype,
int ldc) {
// Because the gcc 4.8 doesn't expand template parameter pack that
// appears in a lambda-expression, I can not use template parameter pack
// here.
#if CUDA_VERSION >= 8000
VLOG(5) << "use_tensor_op_math: "
<< (dev_ctx->tensor_core_available() ? "True" : "False");
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cublasSgemmEx(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc));
});
#else
PADDLE_THROW(phi::errors::Unimplemented(
"cublasSgemmEx is not supported on cuda <= 7.5"));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
......@@ -366,66 +314,6 @@ struct CUBlas<phi::dtype::float16> {
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const void *alpha,
const void *A,
cudaDataType_t Atype,
int lda,
const void *B,
cudaDataType_t Btype,
int ldb,
const void *beta,
void *C,
cudaDataType_t Ctype,
int ldc,
cudaDataType_t computeType) {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc,
computeType,
algo));
});
#else
PADDLE_THROW(phi::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
......@@ -636,66 +524,6 @@ struct CUBlas<phi::dtype::complex<float>> {
ldb));
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const void *alpha,
const void *A,
cudaDataType_t Atype,
int lda,
const void *B,
cudaDataType_t Btype,
int ldb,
const void *beta,
void *C,
cudaDataType_t Ctype,
int ldc,
cudaDataType_t computeType) {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc,
computeType,
algo));
});
#else
PADDLE_THROW(phi::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
......@@ -965,66 +793,6 @@ struct CUBlas<phi::dtype::complex<double>> {
batch_size));
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const void *alpha,
const void *A,
cudaDataType_t Atype,
int lda,
const void *B,
cudaDataType_t Btype,
int ldb,
const void *beta,
void *C,
cudaDataType_t Ctype,
int ldc,
cudaDataType_t computeType) {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc,
computeType,
algo));
});
#else
PADDLE_THROW(phi::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
......@@ -1088,16 +856,16 @@ struct CUBlas<phi::dtype::complex<double>> {
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
......@@ -1109,8 +877,7 @@ void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
#if CUDA_VERSION >= 8000
if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
auto &cuda_ctx =
const_cast<paddle::platform::CUDADeviceContext &>(context_);
auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
CUBlas<T>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
......@@ -1152,151 +919,6 @@ void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
#endif // CUDA_VERSION >= 8000
}
template <>
template <typename T>
void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
#if CUDA_VERSION >= 8000
if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
CUBlas<T>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
CUDA_R_32F,
ldb,
A,
CUDA_R_32F,
lda,
&beta,
C,
CUDA_R_32F,
N);
} else {
#endif // CUDA_VERSION >= 8000
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
A,
lda,
&beta,
C,
N);
});
#if CUDA_VERSION >= 8000
}
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
const phi::dtype::float16 *B,
phi::dtype::float16 beta,
phi::dtype::float16 *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
53,
phi::errors::InvalidArgument(
"cublas fp16 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
CUBlas<phi::dtype::float16>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
B,
CUDA_R_16F,
ldb,
A,
CUDA_R_16F,
lda,
&h_beta,
C,
CUDA_R_16F,
N,
CUDA_R_32F);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<phi::dtype::float16>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
h_B,
ldb,
h_A,
lda,
&h_beta,
h_C,
N);
});
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
......@@ -1376,77 +998,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
#if CUDA_VERSION >= 11000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
80,
phi::errors::InvalidArgument(
"cublas fp16 gemm requires GPU compute capability >= 80,"
"but received %d",
context_.GetComputeCapability()));
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
bool use_tensor_op_math = context_.tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False");
context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cublasGemmEx(handle,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
B,
CUDA_R_16BF,
ldb,
A,
CUDA_R_16BF,
lda,
&h_beta,
C,
CUDA_R_16BF,
N,
CUDA_R_32F,
algo));
});
#else
// raise error
PADDLE_THROW(phi::errors::Unimplemented(
"cublasGemmEx with bfloat16 is not supported on cuda <= 11"));
#endif // CUDA_VERSION >= 11000
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
......@@ -1517,87 +1068,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
#endif // CUDA_VERSION >= 11000
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::complex<float> alpha,
const phi::dtype::complex<float> *A,
const phi::dtype::complex<float> *B,
phi::dtype::complex<float> beta,
phi::dtype::complex<float> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
53,
phi::errors::InvalidArgument(
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<float> c_alpha =
thrust::complex<float>(alpha.real, alpha.imag);
thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
CUBlas<phi::dtype::complex<float>>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&c_alpha,
B,
CUDA_C_32F,
ldb,
A,
CUDA_C_32F,
lda,
&c_beta,
C,
CUDA_C_32F,
N,
CUDA_C_32F);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<phi::dtype::complex<float>>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&c_alpha,
h_B,
ldb,
h_A,
lda,
&c_beta,
h_C,
N);
});
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
......@@ -1680,17 +1150,16 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::complex<double> alpha,
const phi::dtype::complex<double> *A,
const phi::dtype::complex<double> *B,
phi::dtype::complex<double> beta,
phi::dtype::complex<double> *C) const {
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::complex<double> alpha,
const phi::dtype::complex<double> *A,
const phi::dtype::complex<double> *B,
phi::dtype::complex<double> beta,
phi::dtype::complex<double> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
......@@ -1719,7 +1188,7 @@ inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
CUBlas<phi::dtype::complex<double>>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
......@@ -1760,153 +1229,6 @@ inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::complex<double> alpha,
const phi::dtype::complex<double> *A,
const phi::dtype::complex<double> *B,
phi::dtype::complex<double> beta,
phi::dtype::complex<double> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
53,
phi::errors::InvalidArgument(
"cublas complex128 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<double> c_alpha =
thrust::complex<double>(alpha.real, alpha.imag);
thrust::complex<double> c_beta =
thrust::complex<double>(beta.real, beta.imag);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
CUBlas<phi::dtype::complex<double>>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&c_alpha,
B,
CUDA_C_64F,
ldb,
A,
CUDA_C_64F,
lda,
&c_beta,
C,
CUDA_C_64F,
N,
CUDA_C_64F);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<phi::dtype::complex<double>>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&c_alpha,
h_B,
ldb,
h_A,
lda,
&c_beta,
h_C,
N);
});
#endif // CUDA_VERSION >= 8000
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::GEMM(bool transA,
bool transB,
int M,
int N,
int K,
T alpha,
const T *A,
int lda,
const T *B,
int ldb,
T beta,
T *C,
int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
#if CUDA_VERSION >= 8000
if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
auto &cuda_ctx =
const_cast<paddle::platform::CUDADeviceContext &>(context_);
CUBlas<T>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
CUDA_R_32F,
ldb,
A,
CUDA_R_32F,
lda,
&beta,
C,
CUDA_R_32F,
ldc);
} else {
#endif // CUDA_VERSION >= 8000
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
A,
lda,
&beta,
C,
ldc);
});
#if CUDA_VERSION >= 8000
}
#endif // CUDA_VERSION >= 8000
}
template <>
template <typename T>
void Blas<phi::GPUContext>::GEMM(bool transA,
......@@ -1972,45 +1294,6 @@ void Blas<phi::GPUContext>::GEMM(bool transA,
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
bool transA,
bool transB,
int M,
int N,
int K,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
int lda,
const phi::dtype::float16 *B,
int ldb,
phi::dtype::float16 beta,
phi::dtype::float16 *C,
int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<phi::dtype::float16>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
A,
lda,
&beta,
C,
ldc);
});
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(bool transA,
......@@ -2049,17 +1332,6 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::AXPY(int n,
T alpha,
const T *x,
T *y) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
......@@ -2068,15 +1340,6 @@ void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::SCAL(int n,
const T alpha,
T *x) const {
context_.CublasCall(
[&](cublasHandle_t handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <>
template <typename T>
void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
......@@ -2086,247 +1349,67 @@ void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::VCOPY(int n,
const T *x,
T *y) const {
context_.CublasCall(
[&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <>
template <typename T>
void Blas<phi::GPUContext>::VCOPY(int n, const T *x, T *y) const {
context_.CublasCall(
[&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::GEMV(bool trans_a,
int M,
int N,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::GEMV(bool trans_a,
int M,
int N,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMV(
bool trans_a,
int M,
int N,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
const phi::dtype::float16 *B,
phi::dtype::float16 beta,
phi::dtype::float16 *C) const {
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if (trans_a) {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
int M,
int N,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
const phi::dtype::float16 *B,
phi::dtype::float16 beta,
phi::dtype::float16 *C) const {
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if (trans_a) {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMV(
bool trans_a,
int M,
int N,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
// Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve
// it.
if (trans_a) {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
int M,
int N,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
// Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve
// it.
if (trans_a) {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T *A,
const T *B,
T beta,
T *C,
int batchCount,
int64_t strideA,
int64_t strideB) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int64_t strideC = M * N;
#if CUDA_VERSION >= 9010
if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same<T, float>::value)) ||
std::is_same<T, phi::dtype::float16>::value) {
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
bool use_tensor_op_math = context_.tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
VLOG(4) << "use_half_precision_compute_type: "
<< FLAGS_gemm_use_half_precision_compute_type;
auto fp = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
cudaDataType_t compute_type = fp;
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
void *a = static_cast<void *>(&h_alpha);
void *b = static_cast<void *>(&h_beta);
// set ComputeType as CUDA_R_32F for fp16, for better accuracy
if (FLAGS_gemm_use_half_precision_compute_type == true &&
std::is_same<T, phi::dtype::float16>::value) {
a = static_cast<void *>(&alpha);
b = static_cast<void *>(&beta);
compute_type = CUDA_R_16F;
}
// set ComputeType as CUDA_R_32F for fp16 and fp32, for better accuracy
context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cublasGemmStridedBatchedEx(handle,
cuTransB,
cuTransA,
N,
M,
K,
a,
B,
fp,
ldb,
strideB,
A,
fp,
lda,
strideA,
b,
C,
fp,
ldc,
strideC,
batchCount,
compute_type,
algo));
});
} else {
#endif // CUDA_VERSION >= 9010
void Blas<phi::GPUContext>::VCOPY(int n, const T *x, T *y) const {
context_.CublasCall(
[&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM_STRIDED_BATCH(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
strideB,
A,
lda,
strideA,
&beta,
C,
ldc,
strideC,
batchCount);
});
template <>
template <typename T>
void Blas<phi::GPUContext>::GEMV(bool trans_a,
int M,
int N,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
#if CUDA_VERSION >= 9010
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
int M,
int N,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
const phi::dtype::float16 *B,
phi::dtype::float16 beta,
phi::dtype::float16 *C) const {
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if (trans_a) {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
int M,
int N,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
// Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve
// it.
if (trans_a) {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
#endif // CUDA_VERSION >= 9010
}
template <>
......@@ -2438,78 +1521,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
#endif // CUDA_VERSION >= 9010
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C,
int batchCount,
int64_t strideA,
int64_t strideB) const {
#if CUDA_VERSION >= 11000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int64_t strideC = M * N;
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
bool use_tensor_op_math = context_.tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False");
context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cublasGemmStridedBatchedEx(
handle,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
B,
CUDA_R_16BF,
ldb,
strideB,
A,
CUDA_R_16BF,
lda,
strideA,
&h_beta,
C,
CUDA_R_16BF,
ldc,
strideC,
batchCount,
CUBLAS_COMPUTE_32F,
algo));
});
#else
// raise error
PADDLE_THROW(phi::errors::Unimplemented(
"cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= "
"11"));
#endif // CUDA_VERSION >= 11000
}
template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -2582,26 +1593,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
#endif // CUDA_VERSION >= 11000
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T **A,
const T **B,
T beta,
T **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<T>(
transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
}
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -2621,26 +1612,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
}
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::float16 alpha,
const phi::dtype::float16 **A,
const phi::dtype::float16 **B,
phi::dtype::float16 beta,
phi::dtype::float16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<phi::dtype::float16>(
transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -2660,26 +1631,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
}
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 **A,
const phi::dtype::bfloat16 **B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<phi::dtype::bfloat16>(
transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -2699,37 +1650,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
}
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side,
CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA,
CBLAS_DIAG diag,
int M,
int N,
T alpha,
const T *A,
int lda,
T *B,
int ldb) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
cublasSideMode_t cuSide =
(side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
cublasFillMode_t cuUplo =
(uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
// use CUBLAS_OP_C (conjugate transpose) for complex
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasDiagType_t cuDiag =
(diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::TRSM(
handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb);
});
}
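Spelling out the one-line comment in the TRSM bodies (algebra only, nothing new added to the patch): transposing the triangular system moves the matrix to the other side and flips its triangle, which is why cuSide and cuUplo are inverted while the row-major buffers pass straight through:

$$\mathrm{op}(A)\,X = \alpha B \;\Longleftrightarrow\; X^{\top}\,\mathrm{op}(A)^{\top} = \alpha B^{\top},$$

so a left-side, lower-triangular op(A) becomes a right-side, upper-triangular op(A)ᵀ.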
template <>
template <typename T>
void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
......@@ -2761,15 +1681,6 @@ void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRF(
int n, T **a, int *ipiv, int *info, int batch_size) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRF(
......@@ -2779,25 +1690,6 @@ void Blas<phi::GPUContext>::BatchedGETRF(
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRI(
int n, const T **a, const int *ipiv, T **a_inv, int *info, int batch_size)
const {
PADDLE_ENFORCE_NE(
a_inv,
a,
phi::errors::InvalidArgument(
"cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
"in-place. The memory space of output matrix (address: %p) cannot "
"overlap memory space of input matrix (address: %p).",
a_inv,
a));
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
});
}
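A hypothetical sketch (not code from this patch) of how the two batched wrappers are meant to be chained for batched inversion on the remaining phi::GPUContext path; the GetBlas helper, the header path, and all buffer names below are assumptions:

#include "paddle/phi/kernels/funcs/blas/blas.h"

template <typename T>
void BatchedInverse(const phi::GPUContext& dev_ctx,
                    T** d_a,      // device array of batch_size pointers to n*n inputs
                    T** d_a_inv,  // device array of output pointers; must not alias d_a
                    int* d_ipiv,  // n * batch_size pivot indices
                    int* d_info,  // batch_size status codes
                    int n,
                    int batch_size) {
  auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
  // LU-factorize every matrix in place: a <- P * L * U.
  blas.BatchedGETRF(n, d_a, d_ipiv, d_info, batch_size);
  // Form the inverses from the LU factors; the wrapper above rejects in-place output.
  blas.BatchedGETRI(
      n, const_cast<const T**>(d_a), d_ipiv, d_a_inv, d_info, batch_size);
}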
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRI(int n,
......@@ -2820,15 +1712,6 @@ void Blas<phi::GPUContext>::BatchedGETRI(int n,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedMatInv(
int n, const T **a, T **a_inv, int *info, int batch_size) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedMatInv(
......@@ -2838,28 +1721,6 @@ void Blas<phi::GPUContext>::BatchedMatInv(
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRS(
CBLAS_TRANSPOSE trans,
int n,
int nrhs,
const T **a,
int lda,
int *ipiv,
T **b,
int ldb,
int *info,
int batch_size) const {
// use CUBLAS_OP_C (conjugate transpose) for complex
cublasOperation_t cuTrans =
(trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRS_BATCH(
handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size);
});
}
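For context (standard LAPACK algebra, nothing added to the patch itself): GETRS consumes the factorization produced by BatchedGETRF above, so for the no-transpose case each batched solve A X = B reduces to two triangular sweeps:

$$P A = L U \;\Rightarrow\; \text{solve } L\,Y = P\,B \text{ (forward substitution)},\qquad U\,X = Y \text{ (backward substitution)}.$$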
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
......@@ -2881,50 +1742,6 @@ void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedTRSM(
CBLAS_SIDE side,
CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA,
CBLAS_DIAG diag,
int M,
int N,
T alpha,
const T **A,
int lda,
T **B,
int ldb,
int batch_size) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
cublasSideMode_t cuSide =
(side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
cublasFillMode_t cuUplo =
(uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
// use CUBLAS_OP_C (conjugate transpose) for complex
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasDiagType_t cuDiag =
(diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::TRSM_BATCH(handle,
cuSide,
cuUplo,
cuTransA,
cuDiag,
N,
M,
&alpha,
A,
lda,
B,
ldb,
batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedTRSM(CBLAS_SIDE side,
......
......@@ -257,54 +257,6 @@ struct CUBlas<phi::dtype::float16> {
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const void *alpha,
const void *A,
rocblas_datatype Atype,
int lda,
const void *B,
rocblas_datatype Btype,
int ldb,
const void *beta,
void *C,
rocblas_datatype Ctype,
int ldc,
rocblas_datatype computeType) {
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::rocblas_gemm_ex(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc,
C,
Ctype,
ldc,
computeType,
algo,
0,
0));
});
}
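  // NOTE: rocblas_gemm_ex takes separate C (input) and D (output) matrices,
  // which is why C, Ctype and ldc appear twice in the call above; passing the
  // same buffer for both gives the usual in-place
  // C = alpha * op(A) * op(B) + beta * C.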
template <typename... ARGS>
static void GEMM_EX(phi::GPUContext *dev_ctx,
rocblas_operation transa,
rocblas_operation transb,
......@@ -474,54 +426,6 @@ struct CUBlas<phi::dtype::complex<float>> {
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const void *alpha,
const void *A,
rocblas_datatype Atype,
int lda,
const void *B,
rocblas_datatype Btype,
int ldb,
const void *beta,
void *C,
rocblas_datatype Ctype,
int ldc,
rocblas_datatype computeType) {
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::rocblas_gemm_ex(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc,
C,
Ctype,
ldc,
computeType,
algo,
0,
0));
});
}
template <typename... ARGS>
static void GEMM_EX(phi::GPUContext *dev_ctx,
rocblas_operation transa,
rocblas_operation transb,
......@@ -692,54 +596,6 @@ struct CUBlas<phi::dtype::complex<double>> {
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const void *alpha,
const void *A,
rocblas_datatype Atype,
int lda,
const void *B,
rocblas_datatype Btype,
int ldb,
const void *beta,
void *C,
rocblas_datatype Ctype,
int ldc,
rocblas_datatype computeType) {
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::rocblas_gemm_ex(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc,
C,
Ctype,
ldc,
computeType,
algo,
0,
0));
});
}
template <typename... ARGS>
static void GEMM_EX(phi::GPUContext *dev_ctx,
rocblas_operation transa,
rocblas_operation transb,
......@@ -789,45 +645,6 @@ struct CUBlas<phi::dtype::complex<double>> {
}
};
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
A,
lda,
&beta,
C,
N);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
......@@ -868,62 +685,6 @@ void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
});
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
const phi::dtype::float16 *B,
phi::dtype::float16 beta,
phi::dtype::float16 *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
53,
phi::errors::InvalidArgument(
"cublas fp16 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
CUBlas<phi::dtype::float16>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
B,
rocblas_datatype_f16_r,
ldb,
A,
rocblas_datatype_f16_r,
lda,
&h_beta,
C,
rocblas_datatype_f16_r,
N,
rocblas_datatype_f32_r);
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
......@@ -982,17 +743,16 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
......@@ -1052,11 +812,11 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
phi::dtype::complex<float> alpha,
const phi::dtype::complex<float> *A,
const phi::dtype::complex<float> *B,
phi::dtype::complex<float> beta,
phi::dtype::complex<float> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
......@@ -1067,140 +827,19 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(zhiqiu): 80 has the same meaning for rocm and cuda?
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
80,
53,
phi::errors::InvalidArgument(
"rocblas fp16 gemm requires GPU compute capability >= 80,"
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::rocblas_gemm_ex(handle,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
B,
rocblas_datatype_bf16_r,
ldb,
A,
rocblas_datatype_bf16_r,
lda,
&h_beta,
C,
rocblas_datatype_bf16_r,
N,
C,
rocblas_datatype_bf16_r,
N,
rocblas_datatype_f32_r,
algo,
0,
0));
});
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::complex<float> alpha,
const phi::dtype::complex<float> *A,
const phi::dtype::complex<float> *B,
phi::dtype::complex<float> beta,
phi::dtype::complex<float> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
53,
phi::errors::InvalidArgument(
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<float> c_alpha =
thrust::complex<float>(alpha.real, alpha.imag);
thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
CUBlas<phi::dtype::complex<float>>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&c_alpha,
B,
rocblas_datatype_f32_c,
ldb,
A,
rocblas_datatype_f32_c,
lda,
&c_beta,
C,
rocblas_datatype_f32_c,
N,
rocblas_datatype_f32_c);
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::complex<float> alpha,
const phi::dtype::complex<float> *A,
const phi::dtype::complex<float> *B,
phi::dtype::complex<float> beta,
phi::dtype::complex<float> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
53,
phi::errors::InvalidArgument(
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<float> c_alpha =
thrust::complex<float>(alpha.real, alpha.imag);
thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
CUBlas<phi::dtype::complex<float>>::GEMM_EX(&cuda_ctx,
......@@ -1223,64 +862,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
rocblas_datatype_f32_c);
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::complex<double> alpha,
const phi::dtype::complex<double> *A,
const phi::dtype::complex<double> *B,
phi::dtype::complex<double> beta,
phi::dtype::complex<double> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
53,
phi::errors::InvalidArgument(
"cublas complex128 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<double> c_alpha =
thrust::complex<double>(alpha.real, alpha.imag);
thrust::complex<double> c_beta =
thrust::complex<double>(beta.real, beta.imag);
auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
CUBlas<phi::dtype::complex<double>>::GEMM_EX(&cuda_ctx,
cuTransB,
cuTransA,
N,
M,
K,
&c_alpha,
B,
rocblas_datatype_f64_c,
ldb,
A,
rocblas_datatype_f64_c,
lda,
&c_beta,
C,
rocblas_datatype_f64_c,
N,
rocblas_datatype_f64_c);
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
......@@ -1339,44 +920,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
rocblas_datatype_f64_c);
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::GEMM(bool transA,
bool transB,
int M,
int N,
int K,
T alpha,
const T *A,
int lda,
const T *B,
int ldb,
T beta,
T *C,
int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
rocblas_operation cuTransA =
transA ? rocblas_operation_transpose : rocblas_operation_none;
rocblas_operation cuTransB =
transB ? rocblas_operation_transpose : rocblas_operation_none;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
A,
lda,
&beta,
C,
ldc);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::GEMM(bool transA,
......@@ -1416,46 +959,6 @@ void Blas<phi::GPUContext>::GEMM(bool transA,
});
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
bool transA,
bool transB,
int M,
int N,
int K,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
int lda,
const phi::dtype::float16 *B,
int ldb,
phi::dtype::float16 beta,
phi::dtype::float16 *C,
int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
rocblas_operation cuTransA =
transA ? rocblas_operation_transpose : rocblas_operation_none;
rocblas_operation cuTransB =
transB ? rocblas_operation_transpose : rocblas_operation_none;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<phi::dtype::float16>::GEMM(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
A,
lda,
&beta,
C,
ldc);
});
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(bool transA,
......@@ -1496,16 +999,6 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::AXPY(int n,
T alpha,
const T *x,
T *y) const {
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
......@@ -1514,14 +1007,6 @@ void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::SCAL(int n,
const T alpha,
T *x) const {
context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <>
template <typename T>
void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
......@@ -1529,14 +1014,6 @@ void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
[&](rocblas_handle handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::VCOPY(int n,
const T *x,
T *y) const {
context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <>
template <typename T>
void Blas<phi::GPUContext>::VCOPY(int n, const T *x, T *y) const {
......@@ -1544,23 +1021,6 @@ void Blas<phi::GPUContext>::VCOPY(int n, const T *x, T *y) const {
[&](rocblas_handle handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
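For quick reference, the three level-1 wrappers above keep the standard BLAS semantics:

$$\text{AXPY: } y \leftarrow \alpha x + y,\qquad \text{SCAL: } x \leftarrow \alpha x,\qquad \text{VCOPY: } y \leftarrow x.$$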
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::GEMV(bool trans_a,
int M,
int N,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
rocblas_operation cuTransA =
!trans_a ? rocblas_operation_transpose : rocblas_operation_none;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::GEMV(bool trans_a,
......@@ -1579,26 +1039,6 @@ void Blas<phi::GPUContext>::GEMV(bool trans_a,
});
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMV(
bool trans_a,
int M,
int N,
phi::dtype::float16 alpha,
const phi::dtype::float16 *A,
const phi::dtype::float16 *B,
phi::dtype::float16 beta,
phi::dtype::float16 *C) const {
  // Because rocblas doesn't support half gemv, we express it as a gemm call.
if (trans_a) {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::float16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
}
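The two branches above are the usual reshaping of a half-precision GEMV into a GEMM, treating the vector as a one-column or one-row matrix (plain algebra, not part of the patch):

$$y = A x \;\Rightarrow\; \text{GEMM}(M,\,1,\,N),\qquad y = A^{\top} x \;\Longleftrightarrow\; y^{\top} = x^{\top} A \;\Rightarrow\; \text{GEMM}(1,\,N,\,M).$$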
template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
......@@ -1619,26 +1059,6 @@ inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
}
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMV(
bool trans_a,
int M,
int N,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C) const {
// Because rocblas doesn't support bfloat16 gemv, we use gemmex to achieve it.
if (trans_a) {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
} else {
this->template GEMM<phi::dtype::bfloat16>(
CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
......@@ -1659,56 +1079,6 @@ inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
}
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T *A,
const T *B,
T beta,
T *C,
int batchCount,
int64_t strideA,
int64_t strideB) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
const int64_t strideC = M * N;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMM_STRIDED_BATCH(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
strideB,
A,
lda,
strideA,
&beta,
C,
ldc,
strideC,
batchCount);
});
}
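A plain-C++ reference (illustrative only; alpha = 1, beta = 0 assumed) of the strided-batch layout the call above relies on: batch b reads A + b*strideA and B + b*strideB and writes C + b*strideC, with strideC fixed to M*N:

#include <cstdint>

void StridedBatchedGemmRef(const float* A, const float* B, float* C,
                           int M, int N, int K, int batch_count,
                           std::int64_t strideA, std::int64_t strideB) {
  const std::int64_t strideC = static_cast<std::int64_t>(M) * N;
  for (int b = 0; b < batch_count; ++b) {
    const float* Ab = A + b * strideA;
    const float* Bb = B + b * strideB;
    float* Cb = C + b * strideC;
    // Row-major GEMM for this batch entry.
    for (int m = 0; m < M; ++m)
      for (int n = 0; n < N; ++n) {
        float acc = 0.f;
        for (int k = 0; k < K; ++k) acc += Ab[m * K + k] * Bb[k * N + n];
        Cb[m * N + n] = acc;
      }
  }
}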
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -1758,71 +1128,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
});
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
const phi::dtype::bfloat16 *B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C,
int batchCount,
int64_t strideA,
int64_t strideB) const {
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
const int64_t strideC = M * N;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::rocblas_gemm_strided_batched_ex(
handle,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
B,
rocblas_datatype_bf16_r,
ldb,
strideB,
A,
rocblas_datatype_bf16_r,
lda,
strideA,
&h_beta,
C,
rocblas_datatype_bf16_r,
ldc,
strideC,
C,
rocblas_datatype_bf16_r,
ldc,
strideC,
batchCount,
rocblas_datatype_f32_r,
algo,
0,
0));
});
}
template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -1887,26 +1192,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T **A,
const T **B,
T beta,
T **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<T>(
transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
}
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -1926,25 +1211,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
}
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::float16 alpha,
const phi::dtype::float16 **A,
const phi::dtype::float16 **B,
phi::dtype::float16 beta,
phi::dtype::float16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<phi::dtype::float16>(
transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -1964,26 +1230,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
}
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 **A,
const phi::dtype::bfloat16 **B,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<phi::dtype::bfloat16>(
transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
}
}
template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -2003,37 +1249,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
}
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side,
CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA,
CBLAS_DIAG diag,
int M,
int N,
T alpha,
const T *A,
int lda,
T *B,
int ldb) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
rocblas_side cuSide =
(side == CblasLeft) ? rocblas_side_right : rocblas_side_left;
rocblas_fill cuUplo =
(uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower;
// use CUBLAS_OP_C (conjugate transpose) for complex
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_diagonal cuDiag =
(diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::TRSM(
handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
......@@ -2066,14 +1281,6 @@ void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRF(
int n, T **a, int *ipiv, int *info, int batch_size) const {
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRF(
......@@ -2083,24 +1290,6 @@ void Blas<phi::GPUContext>::BatchedGETRF(
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRI(
int n, const T **a, const int *ipiv, T **a_inv, int *info, int batch_size)
const {
PADDLE_ENFORCE_NE(
a_inv,
a,
phi::errors::InvalidArgument(
"cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
"in-place. The memory space of output matrix (address: %p) cannot "
"overlap memory space of input matrix (address: %p).",
a_inv,
a));
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRI(int n,
......@@ -2123,14 +1312,6 @@ void Blas<phi::GPUContext>::BatchedGETRI(int n,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedMatInv(
int n, const T **a, T **a_inv, int *info, int batch_size) const {
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedMatInv(
......@@ -2140,27 +1321,6 @@ void Blas<phi::GPUContext>::BatchedMatInv(
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRS(
CBLAS_TRANSPOSE trans,
int n,
int nrhs,
const T **a,
int lda,
int *ipiv,
T **b,
int ldb,
int *info,
int batch_size) const {
rocblas_operation cuTrans = (trans == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRS_BATCH(
handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
......@@ -2182,50 +1342,6 @@ void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
});
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedTRSM(
CBLAS_SIDE side,
CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA,
CBLAS_DIAG diag,
int M,
int N,
T alpha,
const T **A,
int lda,
T **B,
int ldb,
int batch_size) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
rocblas_side cuSide =
(side == CblasLeft) ? rocblas_side_right : rocblas_side_left;
rocblas_fill cuUplo =
(uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower;
// use CUBLAS_OP_C (conjugate transpose) for complex
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_diagonal cuDiag =
(diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::TRSM_BATCH(handle,
cuSide,
cuUplo,
cuTransA,
cuDiag,
N,
M,
&alpha,
A,
lda,
B,
ldb,
batch_size);
});
}
template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedTRSM(CBLAS_SIDE side,
......
......@@ -313,10 +313,6 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
AddReluKernel(context.stream(), M, N, Y, B, relu);
}
template class FCFunctor<paddle::platform::CUDADeviceContext, float16>;
template class FCFunctor<paddle::platform::CUDADeviceContext, float>;
template class FCFunctor<paddle::platform::CUDADeviceContext, double>;
template class FCFunctor<GPUContext, float16>;
template class FCFunctor<GPUContext, float>;
template class FCFunctor<GPUContext, double>;
......
......@@ -91,22 +91,6 @@ struct ForRange<phi::GPUContext> {
size_t limit_;
};
// NOTE: After the pten kernel is migrated, it needs to be deleted.
template <>
struct ForRange<paddle::platform::CUDADeviceContext> {
ForRange(const paddle::platform::CUDADeviceContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(limit) {}
template <typename Function>
inline void operator()(Function func) const {
phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx_, limit_);
for_range(func);
}
const paddle::platform::CUDADeviceContext& dev_ctx_;
size_t limit_;
};
#endif
} // namespace funcs
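A hypothetical usage sketch of the surviving phi::funcs::ForRange<phi::GPUContext> specialization; the header path, dev_ctx, and ScaleFunctor below are illustrative assumptions, not code from this patch:

#include "paddle/phi/kernels/funcs/for_range.h"

// Functor invoked once per index i in [0, n) on the device.
struct ScaleFunctor {
  const float* in;
  float* out;
  __device__ void operator()(size_t i) const { out[i] = 2.0f * in[i]; }
};

void ScaleByTwo(const phi::GPUContext& dev_ctx,
                const float* in,
                float* out,
                size_t n) {
  phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx, n);
  for_range(ScaleFunctor{in, out});
}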
......
......@@ -31,22 +31,6 @@ namespace funcs {
using float16 = phi::dtype::float16;
using bfloat16 = phi::dtype::bfloat16;
template struct SetConstant<paddle::platform::CUDADeviceContext,
phi::dtype::float16>;
template struct SetConstant<paddle::platform::CUDADeviceContext,
phi::dtype::bfloat16>;
template struct SetConstant<paddle::platform::CUDADeviceContext, float>;
template struct SetConstant<paddle::platform::CUDADeviceContext, double>;
template struct SetConstant<paddle::platform::CUDADeviceContext, uint8_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int16_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int64_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, bool>;
template struct SetConstant<paddle::platform::CUDADeviceContext,
phi::dtype::complex<float>>;
template struct SetConstant<paddle::platform::CUDADeviceContext,
phi::dtype::complex<double>>;
template struct SetConstant<phi::GPUContext, phi::dtype::float16>;
template struct SetConstant<phi::GPUContext, phi::dtype::bfloat16>;
template struct SetConstant<phi::GPUContext, float>;
......@@ -75,44 +59,18 @@ template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
phi::dtype::complex<double>>;
#define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<paddle::platform::CUDADeviceContext, bool, RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, float, RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
double, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
float16, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
bfloat16, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
int8_t, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
int32_t, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
int64_t, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
phi::dtype::complex<float>, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
phi::dtype::complex<double>, \
RANK>; \
template struct Transpose<phi::GPUContext, bool, RANK>; \
template struct Transpose<phi::GPUContext, float, RANK>; \
template struct Transpose<phi::GPUContext, double, RANK>; \
template struct Transpose<phi::GPUContext, float16, RANK>; \
template struct Transpose<phi::GPUContext, bfloat16, RANK>; \
template struct Transpose<phi::GPUContext, int8_t, RANK>; \
template struct Transpose<phi::GPUContext, int32_t, RANK>; \
template struct Transpose<phi::GPUContext, int64_t, RANK>; \
template struct Transpose<phi::GPUContext, \
phi::dtype::complex<float>, \
RANK>; \
#define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<phi::GPUContext, bool, RANK>; \
template struct Transpose<phi::GPUContext, float, RANK>; \
template struct Transpose<phi::GPUContext, double, RANK>; \
template struct Transpose<phi::GPUContext, float16, RANK>; \
template struct Transpose<phi::GPUContext, bfloat16, RANK>; \
template struct Transpose<phi::GPUContext, int8_t, RANK>; \
template struct Transpose<phi::GPUContext, int32_t, RANK>; \
template struct Transpose<phi::GPUContext, int64_t, RANK>; \
template struct Transpose<phi::GPUContext, \
phi::dtype::complex<float>, \
RANK>; \
template struct Transpose<phi::GPUContext, phi::dtype::complex<double>, RANK>;
DEFINE_GPU_TRANS(1);
......@@ -240,8 +198,7 @@ struct TransposeNormal<phi::GPUContext, T> {
};
// define transpose normal
#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<paddle::platform::CUDADeviceContext, TYPE>; \
#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<phi::GPUContext, TYPE>
DEFINE_GPU_TRANS_NORMAL(float16);
......
......@@ -131,10 +131,5 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
template class MatrixInverseFunctor<GPUContext, float>;
template class MatrixInverseFunctor<GPUContext, double>;
// TODO(chenweihang): remove these instantiations later
template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext, float>;
template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext,
double>;
} // namespace funcs
} // namespace phi
......@@ -170,9 +170,5 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
template class MatrixSolveFunctor<GPUContext, float>;
template class MatrixSolveFunctor<GPUContext, double>;
// TODO(wuweilong): remove these instantiations later
template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, float>;
template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, double>;
} // namespace funcs
} // namespace phi