Unverified commit 88490567, authored by Leo Chen, committed by GitHub

unify fluid::CUDADeviceContext and phi::GPUContext (#44723)

* remove CUDADeviceContext

* remove more template instantiations

* fix ROCm compile
Parent 0a2db7c8
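All of the hunks below follow from one central change, visible in the device-context header hunk near the end of this diff: the CUDADeviceContext class body is deleted and replaced by a type alias, so existing code that spells platform::CUDADeviceContext keeps compiling while only the phi::GPUContext template instantiations remain. A minimal before/after sketch, simplified from the hunks in this commit (surrounding includes, the placeholder member, and the copy-disabling macro are omitted):

// Before this PR: CUDADeviceContext was a thin subclass of phi::GPUContext,
// so functors such as SetConstant, Transpose, or SelectedRowsAddTensor had
// to be instantiated once per context type.
//
//   class CUDADeviceContext : public phi::GPUContext {
//    public:
//     explicit CUDADeviceContext(CUDAPlace place);
//     virtual ~CUDADeviceContext();
//   };
//
// After this PR: the class collapses to an alias, which is why the duplicate
// platform::CUDADeviceContext instantiations throughout the diff can be
// removed without breaking callers.
namespace paddle {
namespace platform {
using CUDADeviceContext = phi::GPUContext;
}  // namespace platform
}  // namespace paddle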
......@@ -23,12 +23,6 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
class GarbageCollector;
......
......@@ -25,11 +25,6 @@
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
namespace memory {
namespace allocation {
......
......@@ -23,13 +23,6 @@ limitations under the License. */
#include "paddle/fluid/operators/miopen_lstm_cache.h"
#endif
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
......
......@@ -182,7 +182,7 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
#endif
size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
// first sum pool
FusedSeqpoolKernelNormal<<<config.block_per_grid.x,
config.thread_per_block.x,
......@@ -209,7 +209,8 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
// not need show click input
N = static_cast<size_t>(batch_size * slot_num *
(embedding_size - cvm_offset));
platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, N);
FusedCVMKernelNoCVM<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
......@@ -391,7 +392,7 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx,
#endif
size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
auto config = GetGpuLaunchConfig1D(dev_ctx, N);
auto config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
if (use_cvm) {
// join grad
FusedSeqpoolCVMGradKernelWithCVM<<<config.block_per_grid.x,
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/gru_op.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
......
......@@ -150,11 +150,6 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
}
}
template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
template class CrossEntropyFunctor<platform::CUDADeviceContext,
platform::float16>;
template class CrossEntropyFunctor<phi::GPUContext, float>;
template class CrossEntropyFunctor<phi::GPUContext, double>;
template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;
......
......@@ -308,24 +308,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
double>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
phi::GPUContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
phi::GPUContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
phi::GPUContext,
float>;
......@@ -576,12 +564,6 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
double>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
phi::GPUContext,
float>;
......@@ -589,12 +571,6 @@ template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
phi::GPUContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
phi::GPUContext,
float>;
......
......@@ -173,12 +173,6 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
axis);
}
template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
template class MaxOutFunctor<platform::CUDADeviceContext, float>;
template class MaxOutFunctor<platform::CUDADeviceContext, double>;
template class MaxOutGradFunctor<phi::GPUContext, float>;
template class MaxOutGradFunctor<phi::GPUContext, double>;
......
......@@ -22,12 +22,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/sampler.h"
#include "paddle/phi/core/ddim.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
......
......@@ -133,77 +133,6 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
}
} // namespace
template <typename T>
struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input1,
const framework::Tensor& input2,
framework::Tensor* output) {
auto in1_height = input1.height();
auto in2_dims = input2.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(
in1_height,
in2_dims[0],
platform::errors::InvalidArgument(
"The two inputs height must be equal."
"But received first input height = [%d], first input height = [%d]",
in1_height,
in2_dims[0]));
PADDLE_ENFORCE_EQ(
in1_height,
out_dims[0],
platform::errors::InvalidArgument(
"The input and output height must be equal."
"But received input height = [%d], output height = [%d]",
in1_height,
out_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(
in1_row_numel,
input2.numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But received first input width = [%d], second input width = [%d]",
in1_row_numel,
input2.numel() / in1_height));
PADDLE_ENFORCE_EQ(
in1_row_numel,
output->numel() / in1_height,
platform::errors::InvalidArgument(
"The input and output width must be equal."
"But received input width = [%d], output width = [%d]",
in1_row_numel,
output->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* in2_data = input2.data<T>();
auto* out_data = output->data<T>();
phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
functor(context, output, static_cast<T>(0));
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(in1_rows.size(), 1);
paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
SelectedRowsAddTensorKernel<T, block_size>
<<<grid, threads, 0, context.stream()>>>(
in1_data,
mixv_in1_rows.CUDAData(context.GetPlace()),
out_data,
in1_row_numel);
auto out_eigen = framework::EigenVector<T>::Flatten(*output);
auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
}
};
template <typename T>
struct SelectedRowsAddTensor<phi::GPUContext, T> {
void operator()(const phi::GPUContext& context,
......@@ -275,12 +204,6 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
}
};
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
platform::float16>;
template struct SelectedRowsAddTensor<phi::GPUContext, float>;
template struct SelectedRowsAddTensor<phi::GPUContext, double>;
template struct SelectedRowsAdd<phi::GPUContext, platform::float16>;
......@@ -363,50 +286,6 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
}
} // namespace
template <typename T>
struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input1,
framework::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(
in1_height,
in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But received first input height = "
"[%d], second input height = [%d]",
in1_height,
in2_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(
in1_row_numel,
input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But received first input width = [%d], second input width = [%d]",
in1_row_numel,
input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* in2_data = input2->data<T>();
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(in1_rows.size(), 1);
paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
SelectedRowsAddToTensorKernel<T, block_size>
<<<grid, threads, 0, context.stream()>>>(
in1_data,
mixv_in1_rows.CUDAData(context.GetPlace()),
in2_data,
in1_row_numel);
}
};
template <typename T>
struct SelectedRowsAddToTensor<phi::GPUContext, T> {
void operator()(const phi::GPUContext& context,
......@@ -451,12 +330,6 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
}
};
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
platform::float16>;
template struct SelectedRowsAddToTensor<phi::GPUContext, float>;
template struct SelectedRowsAddToTensor<phi::GPUContext, double>;
template struct SelectedRowsAddToTensor<phi::GPUContext, int>;
......@@ -625,34 +498,6 @@ struct MergeAddImpl {
}
};
template <typename T>
struct MergeAdd<platform::CUDADeviceContext, T> {
// unary functor, merge by adding duplicated rows in
// the input SelectedRows object.
phi::SelectedRows operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input,
const bool sorted_result) {
return MergeAddImpl<platform::CUDADeviceContext, T>()(
context, input, sorted_result);
}
void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input,
phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CUDADeviceContext, T>()(
context, input, output, sorted_result);
}
void operator()(const platform::CUDADeviceContext& context,
const std::vector<const phi::SelectedRows*>& inputs,
phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CUDADeviceContext, T>()(
context, inputs, output, sorted_result);
}
};
template <typename T>
struct MergeAdd<phi::GPUContext, T> {
// unary functor, merge by adding duplicated rows in
......@@ -678,10 +523,8 @@ struct MergeAdd<phi::GPUContext, T> {
}
};
#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \
template struct MergeAddImpl<platform::CUDADeviceContext, dtype>; \
template struct MergeAddImpl<phi::GPUContext, dtype>; \
template struct MergeAdd<platform::CUDADeviceContext, dtype>; \
#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \
template struct MergeAddImpl<phi::GPUContext, dtype>; \
template struct MergeAdd<phi::GPUContext, dtype>;
TEMPLATE_SPECIALIZED_FOR_MERGEADD(float)
......
......@@ -57,88 +57,6 @@ __global__ void SequencePaddingKernel(T* dst,
}
}
template <typename T>
class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::LoDTensor& seq_tensor,
framework::LoDTensor* pad_tensor,
const framework::LoDTensor& pad_value,
int pad_seq_len = -1,
int lod_level = 0,
bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_lod = seq_tensor.lod();
auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
const auto& seq_tensor_dims = seq_tensor.dims();
const auto& pad_tensor_dims = pad_tensor->dims();
int max_seq_len = MaximumSequenceLength(seq_offsets);
if (pad_seq_len == -1) {
pad_seq_len = max_seq_len;
}
PADDLE_ENFORCE_GE(
pad_seq_len,
max_seq_len,
platform::errors::InvalidArgument(
"The pad_seq_len must be equal to or greater than the "
"original max sequence length. Expected %ld >= %ld, but got %ld < "
"%ld. Please check the input value.",
pad_seq_len,
max_seq_len,
pad_seq_len,
max_seq_len));
int step_width = seq_tensor.numel() / seq_tensor_dims[0];
int seq_num = seq_offsets.size() - 1;
CheckDims(seq_tensor_dims,
pad_tensor_dims,
seq_offsets,
pad_seq_len,
step_width,
layout);
PADDLE_ENFORCE_EQ(
pad_value.numel() == 1 || pad_value.numel() == step_width,
true,
platform::errors::InvalidArgument(
"The numel of 'pad_value' can only be 1 or be equal to "
"the 'step_width', but got %ld != 1 and %ld. Please check the "
"input value.",
pad_value.numel(),
step_width));
const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
const T* seq_data = seq_tensor.data<T>();
T* pad_data = pad_tensor->data<T>();
const T* pad_value_data = pad_value.data<T>();
paddle::framework::MixVector<size_t> mix_vector_seq_offsets(&seq_offsets);
SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
pad_data,
seq_data,
pad_value_data,
pad_value.numel() == 1,
mix_vector_seq_offsets.CUDAData(context.GetPlace()),
seq_num,
pad_seq_len,
step_width,
norm_by_times,
layout);
}
};
template <typename T>
class PaddingLoDTensorFunctor<phi::GPUContext, T> {
public:
......@@ -221,73 +139,6 @@ class PaddingLoDTensorFunctor<phi::GPUContext, T> {
}
};
template <typename T>
class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::LoDTensor& pad_tensor,
framework::LoDTensor* seq_tensor,
int pad_seq_len = -1,
int lod_level = 0,
bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
const auto& seq_tensor_dims = seq_tensor->dims();
const auto& pad_tensor_dims = pad_tensor.dims();
int max_seq_len = MaximumSequenceLength(seq_offsets);
if (pad_seq_len == -1) {
pad_seq_len = max_seq_len;
}
int step_width = seq_tensor->numel() / seq_tensor_dims[0];
int seq_num = seq_offsets.size() - 1;
CheckDims(seq_tensor_dims,
pad_tensor_dims,
seq_offsets,
pad_seq_len,
step_width,
layout);
/*
if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
paddle::framework::TensorCopy(pad_tensor, context.GetPlace(), context,
seq_tensor);
seq_tensor->Resize(seq_tensor_dims);
return;
}
*/
const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
const T* pad_data = pad_tensor.data<T>();
T* seq_data = seq_tensor->data<T>();
paddle::framework::MixVector<size_t> mixv_seq_offsets(&seq_offsets);
SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
seq_data,
pad_data,
nullptr,
false,
mixv_seq_offsets.CUDAData(context.GetPlace()),
seq_num,
pad_seq_len,
step_width,
norm_by_times,
layout);
}
};
template <typename T>
class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
public:
......@@ -355,16 +206,6 @@ class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
}
};
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
template class PaddingLoDTensorFunctor<phi::GPUContext, int>;
template class PaddingLoDTensorFunctor<phi::GPUContext, int64_t>;
template class PaddingLoDTensorFunctor<phi::GPUContext, float>;
......
......@@ -35,43 +35,6 @@ __global__ void SequenceScaleKernel(T* seq,
}
}
template <typename T>
class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const T* scales,
framework::LoDTensor* seq) {
const size_t level = 0;
auto lod = seq->lod();
const size_t num_seq = lod[level].size() - 1;
const size_t seq_width = seq->numel() / seq->dims()[0];
auto abs_offset_lod = framework::ToAbsOffset(lod);
T* seq_data = seq->mutable_data<T>(context.GetPlace());
paddle::framework::MixVector<size_t> mix_vector(&(abs_offset_lod[level]));
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(
HIP_KERNEL_NAME(SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>),
dim3(num_seq),
dim3(PADDLE_CUDA_NUM_THREADS),
0,
context.stream(),
seq_data,
mix_vector.CUDAMutableData(context.GetPlace()),
scales,
seq_width);
#else
SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>
<<<num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
seq_data,
mix_vector.CUDAMutableData(context.GetPlace()),
scales,
seq_width);
#endif
mix_vector.CopyToCPU();
}
};
template <typename T>
class ScaleLoDTensorFunctor<phi::GPUContext, T> {
public:
......@@ -109,9 +72,6 @@ class ScaleLoDTensorFunctor<phi::GPUContext, T> {
}
};
template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, double>;
template class ScaleLoDTensorFunctor<phi::GPUContext, float>;
template class ScaleLoDTensorFunctor<phi::GPUContext, double>;
......
......@@ -141,56 +141,21 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
#endif
}
template class SoftmaxCUDNNFunctor<float, platform::CUDADeviceContext>;
template class SoftmaxCUDNNFunctor<platform::float16,
platform::CUDADeviceContext>;
template class SoftmaxGradCUDNNFunctor<float, platform::CUDADeviceContext>;
template class SoftmaxGradCUDNNFunctor<platform::float16,
platform::CUDADeviceContext>;
template class SoftmaxCUDNNFunctor<float, phi::GPUContext>;
template class SoftmaxCUDNNFunctor<platform::float16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<float, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<platform::float16, phi::GPUContext>;
#if CUDNN_VERSION_MIN(8, 1, 0)
template class SoftmaxCUDNNFunctor<platform::bfloat16,
platform::CUDADeviceContext>;
template class SoftmaxGradCUDNNFunctor<platform::bfloat16,
platform::CUDADeviceContext>;
template class SoftmaxCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
#endif
// MIOPEN do not support double
#ifndef PADDLE_WITH_HIP
template class SoftmaxCUDNNFunctor<double, platform::CUDADeviceContext>;
template class SoftmaxGradCUDNNFunctor<double, platform::CUDADeviceContext>;
template class SoftmaxCUDNNFunctor<double, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<double, phi::GPUContext>;
#endif
template class SoftmaxFunctor<platform::CUDADeviceContext,
platform::float16,
false>;
template class SoftmaxFunctor<platform::CUDADeviceContext,
platform::float16,
true>;
template class SoftmaxFunctor<platform::CUDADeviceContext,
platform::bfloat16,
false>;
template class SoftmaxFunctor<platform::CUDADeviceContext,
platform::bfloat16,
true>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float, false>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double, false>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float, true>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double, true>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext,
platform::float16>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext,
platform::bfloat16>;
template class SoftmaxFunctor<phi::GPUContext, platform::float16, false>;
template class SoftmaxFunctor<phi::GPUContext, platform::float16, true>;
template class SoftmaxFunctor<phi::GPUContext, platform::bfloat16, false>;
......
......@@ -417,13 +417,9 @@ void Col2VolFunctor<DeviceContext, T>::operator()(
}
// };
template class Vol2ColFunctor<platform::CUDADeviceContext, float>;
template class Vol2ColFunctor<platform::CUDADeviceContext, double>;
template class Vol2ColFunctor<phi::GPUContext, float>;
template class Vol2ColFunctor<phi::GPUContext, double>;
template class Col2VolFunctor<platform::CUDADeviceContext, float>;
template class Col2VolFunctor<platform::CUDADeviceContext, double>;
template class Col2VolFunctor<phi::GPUContext, float>;
template class Col2VolFunctor<phi::GPUContext, double>;
......
......@@ -16,12 +16,6 @@
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
sequence_concat,
paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
......
......@@ -51,7 +51,6 @@ namespace platform {
//
// The NCCLComm instance is created and reversed in the NCCLCommContext
// singleton with a global user specified group id.
class CUDADeviceContext;
class NCCLComm {
public:
......
......@@ -533,11 +533,6 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
}
CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
: phi::GPUContext(place) {}
CUDADeviceContext::~CUDADeviceContext() = default;
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
}
......
......@@ -271,15 +271,7 @@ struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
class CudnnWorkspaceHandle;
class EigenCudaStreamDevice;
class CUDADeviceContext : public phi::GPUContext {
public:
explicit CUDADeviceContext(CUDAPlace place);
virtual ~CUDADeviceContext();
private:
int place_holder_; // TO BE REMOVED
DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
};
using CUDADeviceContext = phi::GPUContext;
class CudnnWorkspaceHandle {
public:
......
......@@ -96,66 +96,6 @@ struct Transform<phi::CPUContext> {
};
#if defined(__NVCC__) || defined(__HIPCC__)
template <>
struct Transform<platform::CUDADeviceContext> {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const platform::CUDADeviceContext& context,
InputIter first,
InputIter last,
OutputIter result,
UnaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE_EQ(is_gpu_place(place),
true,
platform::errors::PreconditionNotMet(
"The CUDA Transform must be used in GPU place."));
#ifdef __HIPCC__
thrust::transform(thrust::hip::par.on(context.stream()),
details::CastToCUDATransformIterator(first),
details::CastToCUDATransformIterator(last),
details::CastToCUDATransformIterator(result),
op);
#else
thrust::transform(thrust::cuda::par.on(context.stream()),
details::CastToCUDATransformIterator(first),
details::CastToCUDATransformIterator(last),
details::CastToCUDATransformIterator(result),
op);
#endif
}
template <typename InputIter1,
typename InputIter2,
typename OutputIter,
typename BinaryOperation>
void operator()(const platform::CUDADeviceContext& context,
InputIter1 first1,
InputIter1 last1,
InputIter2 first2,
OutputIter result,
BinaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE_EQ(is_gpu_place(place),
true,
platform::errors::PreconditionNotMet(
"The CUDA Transform must be used in GPU place."));
#ifdef __HIPCC__
thrust::transform(thrust::hip::par.on(context.stream()),
details::CastToCUDATransformIterator(first1),
details::CastToCUDATransformIterator(last1),
details::CastToCUDATransformIterator(first2),
details::CastToCUDATransformIterator(result),
op);
#else
thrust::transform(thrust::cuda::par.on(context.stream()),
details::CastToCUDATransformIterator(first1),
details::CastToCUDATransformIterator(last1),
details::CastToCUDATransformIterator(first2),
details::CastToCUDATransformIterator(result),
op);
#endif
}
};
template <>
struct Transform<phi::GPUContext> {
......
......@@ -313,10 +313,6 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
AddReluKernel(context.stream(), M, N, Y, B, relu);
}
template class FCFunctor<paddle::platform::CUDADeviceContext, float16>;
template class FCFunctor<paddle::platform::CUDADeviceContext, float>;
template class FCFunctor<paddle::platform::CUDADeviceContext, double>;
template class FCFunctor<GPUContext, float16>;
template class FCFunctor<GPUContext, float>;
template class FCFunctor<GPUContext, double>;
......
......@@ -91,22 +91,6 @@ struct ForRange<phi::GPUContext> {
size_t limit_;
};
// NOTE: After the pten kernel is migrated, it needs to be deleted.
template <>
struct ForRange<paddle::platform::CUDADeviceContext> {
ForRange(const paddle::platform::CUDADeviceContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(limit) {}
template <typename Function>
inline void operator()(Function func) const {
phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx_, limit_);
for_range(func);
}
const paddle::platform::CUDADeviceContext& dev_ctx_;
size_t limit_;
};
#endif
} // namespace funcs
......
......@@ -31,22 +31,6 @@ namespace funcs {
using float16 = phi::dtype::float16;
using bfloat16 = phi::dtype::bfloat16;
template struct SetConstant<paddle::platform::CUDADeviceContext,
phi::dtype::float16>;
template struct SetConstant<paddle::platform::CUDADeviceContext,
phi::dtype::bfloat16>;
template struct SetConstant<paddle::platform::CUDADeviceContext, float>;
template struct SetConstant<paddle::platform::CUDADeviceContext, double>;
template struct SetConstant<paddle::platform::CUDADeviceContext, uint8_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int16_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int64_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, bool>;
template struct SetConstant<paddle::platform::CUDADeviceContext,
phi::dtype::complex<float>>;
template struct SetConstant<paddle::platform::CUDADeviceContext,
phi::dtype::complex<double>>;
template struct SetConstant<phi::GPUContext, phi::dtype::float16>;
template struct SetConstant<phi::GPUContext, phi::dtype::bfloat16>;
template struct SetConstant<phi::GPUContext, float>;
......@@ -75,44 +59,18 @@ template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
phi::dtype::complex<double>>;
#define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<paddle::platform::CUDADeviceContext, bool, RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, float, RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
double, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
float16, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
bfloat16, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
int8_t, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
int32_t, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
int64_t, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
phi::dtype::complex<float>, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
phi::dtype::complex<double>, \
RANK>; \
template struct Transpose<phi::GPUContext, bool, RANK>; \
template struct Transpose<phi::GPUContext, float, RANK>; \
template struct Transpose<phi::GPUContext, double, RANK>; \
template struct Transpose<phi::GPUContext, float16, RANK>; \
template struct Transpose<phi::GPUContext, bfloat16, RANK>; \
template struct Transpose<phi::GPUContext, int8_t, RANK>; \
template struct Transpose<phi::GPUContext, int32_t, RANK>; \
template struct Transpose<phi::GPUContext, int64_t, RANK>; \
template struct Transpose<phi::GPUContext, \
phi::dtype::complex<float>, \
RANK>; \
#define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<phi::GPUContext, bool, RANK>; \
template struct Transpose<phi::GPUContext, float, RANK>; \
template struct Transpose<phi::GPUContext, double, RANK>; \
template struct Transpose<phi::GPUContext, float16, RANK>; \
template struct Transpose<phi::GPUContext, bfloat16, RANK>; \
template struct Transpose<phi::GPUContext, int8_t, RANK>; \
template struct Transpose<phi::GPUContext, int32_t, RANK>; \
template struct Transpose<phi::GPUContext, int64_t, RANK>; \
template struct Transpose<phi::GPUContext, \
phi::dtype::complex<float>, \
RANK>; \
template struct Transpose<phi::GPUContext, phi::dtype::complex<double>, RANK>;
DEFINE_GPU_TRANS(1);
......@@ -240,8 +198,7 @@ struct TransposeNormal<phi::GPUContext, T> {
};
// define transpose normal
#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<paddle::platform::CUDADeviceContext, TYPE>; \
#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<phi::GPUContext, TYPE>
DEFINE_GPU_TRANS_NORMAL(float16);
......
......@@ -131,10 +131,5 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
template class MatrixInverseFunctor<GPUContext, float>;
template class MatrixInverseFunctor<GPUContext, double>;
// TODO(chenweihang): remove these instantiations later
template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext, float>;
template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext,
double>;
} // namespace funcs
} // namespace phi
......@@ -170,9 +170,5 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
template class MatrixSolveFunctor<GPUContext, float>;
template class MatrixSolveFunctor<GPUContext, double>;
// TODO(wuweilong): remove these instantiations later
template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, float>;
template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, double>;
} // namespace funcs
} // namespace phi