Unverified commit 86763023, authored by Leo Chen, committed by GitHub

unify gpu context (#44740)

* remove cudaDeviceContext

* remove more template

* fix rocm compile

* remove alias name CUDADeviceContext

* fix compile

* fix tests

* revert changes
Parent: f15d930a
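The change below is mechanical but wide: every cast, template argument, and alias that named `paddle::platform::CUDADeviceContext` now names `phi::GPUContext`, while the pooled context object returned by `DeviceContextPool` stays the same. A minimal sketch of the before/after call-site pattern, assuming a GPU build of Paddle at this commit; the helper name and include paths are illustrative and not part of the commit:

// Editor's sketch (not part of the commit): the call-site pattern this diff rewrites everywhere below.
#include "paddle/fluid/platform/device_context.h"  // DeviceContextPool (assumed path)
#include "paddle/phi/backends/gpu/gpu_context.h"   // phi::GPUContext (assumed path)

// Before: auto* ctx = static_cast<paddle::platform::CUDADeviceContext*>(
//             paddle::platform::DeviceContextPool::Instance().Get(place));
// After: the phi type is named directly; the pooled object is unchanged.
phi::GPUContext* GetGpuContext(const paddle::platform::Place& place) {  // hypothetical helper
  return static_cast<phi::GPUContext*>(
      paddle::platform::DeviceContextPool::Instance().Get(place));
}
// Typical use at the call sites below: gpuStream_t s = GetGpuContext(place)->stream();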
@@ -94,7 +94,7 @@ class NPUEventManager {
   PADDLE_ENFORCE_EQ(device_index,
                     device_index_,
                     platform::errors::PreconditionNotMet(
-                        "CUDADeviceContext's device %d does not match"
+                        "phi::GPUContext's device %d does not match"
                         "Event's device %d",
                         device_index,
                         device_index_));
...
@@ -104,7 +104,7 @@ class EventManager {
   bool DeviceId() const { return device_index_; }
   gpuEvent_t GetRawCudaEvent() const { return event_; }
-  void Record(const paddle::platform::CUDADeviceContext& ctx) {
+  void Record(const phi::GPUContext& ctx) {
     auto device_index = ctx.GetPlace().device;
     if (!is_created_) {
       CreateEvent(device_index);
@@ -112,7 +112,7 @@ class EventManager {
     PADDLE_ENFORCE_EQ(device_index,
                       device_index_,
                       platform::errors::PreconditionNotMet(
-                          "CUDADeviceContext's device %d does not match"
+                          "phi::GPUContext's device %d does not match"
                           "Event's device %d",
                           device_index,
                           device_index_));
@@ -157,13 +157,13 @@ class EventManager {
     }
   }
-  void Block(const paddle::platform::CUDADeviceContext& ctx) const {
+  void Block(const phi::GPUContext& ctx) const {
     if (is_created_) {
       auto device_index = ctx.GetPlace().device;
       PADDLE_ENFORCE_EQ(device_index,
                         device_index_,
                         platform::errors::PreconditionNotMet(
-                            "CUDADeviceContext's device %d does not match"
+                            "phi::GPUContext's device %d does not match"
                             "Event's device %d",
                             device_index,
                             device_index_));
...
@@ -31,10 +31,10 @@ namespace distributed {
 void SyncDefaultStream(
     const std::vector<Place>& places,
     std::vector<EventManager>& ncclEvents,                       // NOLINT
-    std::vector<std::unique_ptr<CUDADeviceContext>>& dev_ctx) {  // NOLINT
+    std::vector<std::unique_ptr<phi::GPUContext>>& dev_ctx) {    // NOLINT
   for (size_t i = 0; i < places.size(); ++i) {
-    auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
+    auto* default_ctx = static_cast<phi::GPUContext*>(
         platform::DeviceContextPool::Instance().Get(places[i]));
     ncclEvents[i].Record(*default_ctx);
     ncclEvents[i].Block(*dev_ctx[i]);
@@ -69,7 +69,7 @@ void ProcessGroupNCCL::NCCLTask::SetOutputs(
 void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() {
   for (size_t i = 0; i < places_.size(); ++i) {
-    auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
+    auto* default_ctx = static_cast<phi::GPUContext*>(
         platform::DeviceContextPool::Instance().Get(places_[i]));
     default_ctx->WaitEvent(control_events_[i].GetRawCudaEvent());
   }
@@ -201,7 +201,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache(
           << ", place: " << places_key
           << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id);
-  std::vector<std::unique_ptr<CUDADeviceContext>> dev_ctx;
+  std::vector<std::unique_ptr<phi::GPUContext>> dev_ctx;
   dev_ctx.resize(places.size());
   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
@@ -209,7 +209,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache(
   for (size_t i = 0; i < places.size(); ++i) {
     platform::CUDADeviceGuard guard(places[i]);
     nccl_comms[i] = NCCLCommManager::Create(GetSize(), GetRank(), nccl_id);
-    dev_ctx[i].reset(new CUDADeviceContext(places[i]));
+    dev_ctx[i].reset(new phi::GPUContext(places[i]));
   }
   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
...
@@ -45,7 +45,6 @@ namespace paddle {
 namespace distributed {
 using Place = paddle::platform::Place;
-using CUDADeviceContext = paddle::platform::CUDADeviceContext;
 class ProcessGroupNCCL : public ProcessGroup {
  public:
@@ -174,8 +173,7 @@ class ProcessGroupNCCL : public ProcessGroup {
   std::unordered_map<std::string, std::vector<EventManager>> places_to_events_;
-  std::unordered_map<std::string,
-                     std::vector<std::unique_ptr<CUDADeviceContext>>>
+  std::unordered_map<std::string, std::vector<std::unique_ptr<phi::GPUContext>>>
       places_to_ctx_;
   std::set<int> used_place_ids_;
...
@@ -241,7 +241,7 @@ static void SplitTensorsWithType(const DeviceContext &context,
 void EagerGroup::ConcatTensors(const platform::Place &place) {
   if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
+    auto *default_ctx = static_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     ConcatTensorsWithType(
         *default_ctx, dense_tensors_, &dense_contents_, dtype_);
@@ -264,7 +264,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) {
 void EagerGroup::SplitTensors(const platform::Place &place) {
   if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
+    auto *default_ctx = static_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     SplitTensorsWithType(
         *default_ctx, &dense_contents_, &dense_tensors_, dtype_);
@@ -883,7 +883,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group,
   auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_);
   if (platform::is_gpu_place(inner_place_)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    dev_ctx = static_cast<platform::CUDADeviceContext *>(
+    dev_ctx = static_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(inner_place_));
 #else
     PADDLE_THROW(platform::errors::PermissionDenied(
...
@@ -78,8 +78,7 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data,
   VLOG(3) << "Loading data for GPU.";
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto *dev_ctx =
-      dynamic_cast<const platform::CUDADeviceContext *>(pool.Get(place));
+  auto *dev_ctx = dynamic_cast<const phi::GPUContext *>(pool.Get(place));
   auto gpu_place = place;
   memory::Copy(gpu_place,
                static_cast<void *>(input_tensor_ptr),
...
@@ -119,8 +119,7 @@ void SerializeLodTensor(framework::Variable* var,
     char* temp_ptr =
         new char[tensor->numel() *
                  framework::DataTypeSize(tensor->dtype())];  // NOLINT
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(
         platform::CPUPlace(),
         temp_ptr,
@@ -168,8 +167,7 @@ void SerializeSelectedRows(framework::Variable* var,
     char* temp_ptr =
         new char[tensor->numel() *
                  framework::DataTypeSize(tensor->dtype())];  // NOLINT
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(
         platform::CPUPlace(),
         temp_ptr,
@@ -265,8 +263,7 @@ void DeserializeLodTensor(framework::Variable* var,
                  framework::DataTypeSize(tensor->dtype())];  // NOLINT
     io_buffer_itr.copy_and_forward((void*)(&data_len), 8);     // NOLINT
     io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len); // NOLINT
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(place,
                  tensor_data,
                  platform::CPUPlace(),
@@ -311,8 +308,7 @@ void DeserializeSelectedRows(
     unsigned long data_len;  // NOLINT
     io_buffer_itr.copy_and_forward((void*)(&data_len), 8);  // NOLINT
     io_buffer_itr.copy_and_forward(temp_ptr, data_len);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(place,
                  tensor_data,
                  platform::CPUPlace(),
...
@@ -43,8 +43,7 @@ int GetMicroId(const platform::DeviceContext& ctx,
     std::vector<char> temp;
     temp.resize(tensor->numel() * framework::DataTypeSize(tensor->dtype()));
     char* temp_ptr = temp.data();
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(platform::CPUPlace(),
                  temp_ptr,
                  tensor->place(),
...
@@ -134,21 +134,20 @@ void ScaleAPI(const paddle::experimental::Tensor& x,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   } else if (expected_kernel_place == paddle::platform::CUDAPlace()) {
-    auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
-        pool.Get(expected_kernel_place));
+    auto* dev_ctx =
+        dynamic_cast<phi::GPUContext*>(pool.Get(expected_kernel_place));
     if (!dev_ctx) {
       PADDLE_THROW(paddle::platform::errors::Fatal(
           "Cannot convert device_context to CUDADeviceContext."
           "This indicates backend mismatch."
           "Pleas double check your expected place"));
     }
-    ScaleDeviceDispatch<paddle::platform::CUDADeviceContext>(
-        *dense_tensor.get(),
-        *dev_ctx,
-        scale,
-        bias,
-        bias_after_scale,
-        dense_out.get());
+    ScaleDeviceDispatch<phi::GPUContext>(*dense_tensor.get(),
+                                         *dev_ctx,
+                                         scale,
+                                         bias,
+                                         bias_after_scale,
+                                         dense_out.get());
 #endif
   } else {
     PADDLE_THROW(paddle::platform::errors::Fatal(
...
@@ -38,8 +38,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) {
   auto& place = dense_tensor->place();
   if (paddle::platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    paddle::framework::details::tensor_check<
-        paddle::platform::CUDADeviceContext>(
+    paddle::framework::details::tensor_check<phi::GPUContext>(
         api_name, tensor_name, *dense_tensor, place);
 #else
     PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
...
@@ -66,8 +66,7 @@ TEST(Benchmark, FluidScaleCUDA) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();
     paddle::memory::Copy(place,
                          mutable_x,
@@ -121,8 +120,7 @@ TEST(Benchmark, FluidMatmulCUDA) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();
     auto* x_tensor = X->MutableVar()->GetMutable<framework::LoDTensor>();
@@ -181,8 +179,7 @@ TEST(Benchmark, FluidMLPCUDA) {
   for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();
     std::vector<float> x_src_data(MLP_M * MLP_N, MLP_X_VAL);
...
@@ -171,8 +171,7 @@ static void FluidCheckTensorValue(const std::shared_ptr<imperative::VarBase>& X,
   if (place == paddle::platform::CUDAPlace()) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();
     paddle::memory::Copy(paddle::platform::CPUPlace(),
@@ -204,8 +203,7 @@ static void FluidCheckGradTensorValue(
   if (place == paddle::platform::CUDAPlace()) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();
     paddle::memory::Copy(paddle::platform::CPUPlace(),
...
@@ -40,8 +40,8 @@ bool CompareGradTensorWithValue(const paddle::experimental::Tensor& target,
 #ifdef PADDLE_WITH_CUDA
   paddle::platform::DeviceContextPool& pool =
       paddle::platform::DeviceContextPool::Instance();
-  auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
-      pool.Get(paddle::platform::CUDAPlace()));
+  auto* dev_ctx =
+      dynamic_cast<phi::GPUContext*>(pool.Get(paddle::platform::CUDAPlace()));
   auto stream = dev_ctx->stream();
   paddle::memory::Copy(paddle::platform::CPUPlace(),
@@ -79,8 +79,8 @@ bool CompareTensorWithValue(const paddle::experimental::Tensor& target,
 #ifdef PADDLE_WITH_CUDA
   paddle::platform::DeviceContextPool& pool =
       paddle::platform::DeviceContextPool::Instance();
-  auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
-      pool.Get(paddle::platform::CUDAPlace()));
+  auto* dev_ctx =
+      dynamic_cast<phi::GPUContext*>(pool.Get(paddle::platform::CUDAPlace()));
   auto stream = dev_ctx->stream();
   paddle::memory::Copy(paddle::platform::CPUPlace(),
...
@@ -92,9 +92,8 @@ REGISTER_OP_WITHOUT_GRADIENT(
     paddle::framework::OpKernelTestProtoAndCheckerMaker);
 REGISTER_OP_CPU_KERNEL(test_op,
                        paddle::framework::TestKernel<phi::CPUContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    test_op,
-    paddle::framework::TestKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(test_op,
+                        paddle::framework::TestKernel<phi::GPUContext, float>);
 static void BuildVar(const std::string& param_name,
                      std::initializer_list<const char*> arguments,
...
@@ -2809,7 +2809,7 @@ void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) {
 MiniBatchGpuPack::MiniBatchGpuPack(const paddle::platform::Place& place,
                                    const std::vector<UsedSlotInfo>& infos) {
   place_ = place;
-  stream_ = dynamic_cast<platform::CUDADeviceContext*>(
+  stream_ = dynamic_cast<phi::GPUContext*>(
                 platform::DeviceContextPool::Instance().Get(place))
                 ->stream();
@@ -2843,7 +2843,7 @@ MiniBatchGpuPack::~MiniBatchGpuPack() {}
 void MiniBatchGpuPack::reset(const paddle::platform::Place& place) {
   place_ = place;
-  stream_ = dynamic_cast<platform::CUDADeviceContext*>(
+  stream_ = dynamic_cast<phi::GPUContext*>(
                 platform::DeviceContextPool::Instance().Get(place))
                 ->stream();
   ins_num_ = 0;
...
@@ -89,7 +89,7 @@ void SlotRecordInMemoryDataFeed::FillSlotValueOffset(
     const int float_slot_size,
     const UsedSlotGpuType *used_slots) {
   auto stream =
-      dynamic_cast<platform::CUDADeviceContext *>(
+      dynamic_cast<phi::GPUContext *>(
           paddle::platform::DeviceContextPool::Instance().Get(this->place_))
           ->stream();
   FillSlotValueOffsetKernel<<<GET_BLOCKS(used_slot_num),
@@ -168,7 +168,7 @@ void SlotRecordInMemoryDataFeed::CopyForTensor(
     const int float_slot_size,
     const UsedSlotGpuType *used_slots) {
   auto stream =
-      dynamic_cast<platform::CUDADeviceContext *>(
+      dynamic_cast<phi::GPUContext *>(
          paddle::platform::DeviceContextPool::Instance().Get(this->place_))
          ->stream();
...
@@ -103,8 +103,8 @@ struct CastDataType {
                    CastDataTypeFunctor<InType, OutType>());
 #if defined(__NVCC__) || defined(__HIPCC__)
   } else if (platform::is_gpu_place(in_.place())) {
-    platform::Transform<platform::CUDADeviceContext> trans;
-    auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
+    platform::Transform<phi::GPUContext> trans;
+    auto* context = static_cast<const phi::GPUContext*>(ctx_);
     trans(*context,
           in_begin,
           in_end,
...
@@ -19,7 +19,7 @@ limitations under the License. */
 TEST(DataTypeTransform, GPUTransform) {
   auto cpu_place = paddle::platform::CPUPlace();
   auto gpu_place = paddle::platform::CUDAPlace(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
+  phi::GPUContext context(gpu_place);
   context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(gpu_place, context.stream())
                            .get());
...
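The DataTypeTransform test above shows the one extra step that standalone construction needs: a `phi::GPUContext` created outside the `DeviceContextPool` does not come with an allocator, so the test wires one in from `AllocatorFacade` before use. A hedged sketch of that setup, following the lines in the hunk; the include paths are assumptions, and real kernels may need more than this single allocator:

// Editor's sketch (not part of the commit), mirroring the test hunk above.
#include "paddle/fluid/memory/allocation/allocator_facade.h"  // assumed path
#include "paddle/phi/backends/gpu/gpu_context.h"              // assumed path

void BuildStandaloneGpuContext() {
  auto gpu_place = paddle::platform::CUDAPlace(0);
  phi::GPUContext context(gpu_place);
  // A pooled context is already initialized; a locally constructed one must be
  // given an allocator explicitly, exactly as the test does.
  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(gpu_place, context.stream())
                           .get());
}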
@@ -105,7 +105,7 @@ struct TestBroadcastOpHandle {
     for (int i = 0; i < count; ++i) {
       auto p = p::CUDAPlace(i);
       place_list_.push_back(p);
-      ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      ctxs_.emplace_back(new phi::GPUContext(p));
     }
     nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
 #else
...
@@ -46,7 +46,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
       gc_(gc) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place)) {
-    dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
+    dev_ctx_ = reinterpret_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
       platform::CUDADeviceGuard guard(place.device);
...
@@ -81,7 +81,7 @@ class EagerDeletionOpHandle : public OpHandleBase {
   GarbageCollector *gc_;  // not own
   std::vector<Variable *> vars_;
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  platform::CUDADeviceContext *dev_ctx_{nullptr};
+  phi::GPUContext *dev_ctx_{nullptr};
   gpuEvent_t event_{nullptr};
 #endif
 };
...
@@ -58,7 +58,7 @@ struct TestGatherOpHandle {
     for (int i = 0; i < count; ++i) {
       auto p = p::CUDAPlace(i);
       gpu_list_.push_back(p);
-      ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      ctxs_.emplace_back(new phi::GPUContext(p));
     }
 #else
     PADDLE_THROW(
...
@@ -367,8 +367,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
   if (platform::is_gpu_place(tensor->place())) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    tensor_check<platform::CUDADeviceContext>(
-        op_type, var_name, *tensor, place);
+    tensor_check<phi::GPUContext>(op_type, var_name, *tensor, place);
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "Tensor[%s] use gpu place. PaddlePaddle must compile with GPU.",
...
@@ -135,7 +135,7 @@ __global__ void CheckNanInfKernel(const T* value,
 template <>
 template <typename T>
-void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
+void TensorCheckerVisitor<phi::GPUContext>::apply(
     typename std::enable_if<
         std::is_floating_point<T>::value ||
         std::is_same<T, ::paddle::platform::complex<float>>::value ||
@@ -143,7 +143,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
     const {
   int print_num = 3;
-  auto* dev_ctx = reinterpret_cast<platform::CUDADeviceContext*>(
+  auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(
       platform::DeviceContextPool::Instance().Get(tensor_.place()));
   int dev_id = tensor_.place().device;
   PADDLE_ENFORCE_EQ(
@@ -226,13 +226,13 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
 }
 template <>
-void tensor_check<platform::CUDADeviceContext>(const std::string& op_type,
+void tensor_check<phi::GPUContext>(const std::string& op_type,
                                    const std::string& var_name,
                                    const framework::Tensor& tensor,
                                    const platform::Place& place) {
   std::call_once(init_multi_gpu_op_var_map_flag, InitMultiGPUOpVarMap);
-  TensorCheckerVisitor<platform::CUDADeviceContext> vistor(
+  TensorCheckerVisitor<phi::GPUContext> vistor(
       op_type, var_name, tensor, place);
   VisitDataType(framework::TransToProtoVarType(tensor.dtype()), vistor);
 }
...
@@ -184,8 +184,7 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
       dev_ctx.second->Wait();
     }
   } else {
-    auto stream =
-        static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
+    auto stream = static_cast<phi::GPUContext *>(waited_ctx)->stream();
     for (auto &ev : events_) {
 #ifdef PADDLE_WITH_HIP
       PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0));
@@ -224,8 +223,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
         if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
           auto stream =
-              static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
-                  ->stream();
+              static_cast<phi::GPUContext *>(dev_ctxes_.at(place))->stream();
 #ifdef PADDLE_WITH_HIP
           PADDLE_ENFORCE_GPU_SUCCESS(
               hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
@@ -254,8 +252,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
           platform::DeviceContextPool &pool =
               platform::DeviceContextPool::Instance();
           auto stream =
-              static_cast<platform::CUDADeviceContext *>(pool.Get(place))
-                  ->stream();
+              static_cast<phi::GPUContext *>(pool.Get(place))->stream();
           platform::GpuStreamSync(stream);
 #else
           PADDLE_THROW(platform::errors::PreconditionNotMet(
@@ -277,7 +274,7 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
     if (in_var_handle) {
       if (platform::is_gpu_place(in_var_handle->place())) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-        auto stream = static_cast<platform::CUDADeviceContext *>(
+        auto stream = static_cast<phi::GPUContext *>(
                           dev_ctxes_.at(in_var_handle->place()))
                           ->stream();
 #ifdef PADDLE_WITH_HIP
@@ -318,8 +315,8 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
   if (!events_.empty()) {  // Use event
     for (auto &p : dev_ctxes_) {
       auto dev_id = p.first.device;
-      auto *cuda_dev_ctx = static_cast<platform::CUDADeviceContext *>(p.second);
-      VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id;
+      auto *cuda_dev_ctx = static_cast<phi::GPUContext *>(p.second);
+      VLOG(10) << "phi::GPUContext:" << cuda_dev_ctx << ", dev_id:" << dev_id;
 #ifdef PADDLE_WITH_HIP
       PADDLE_ENFORCE_GPU_SUCCESS(
           hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream()));
@@ -339,7 +336,7 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p,
     callback();
   } else {
     auto *ctx = dev_ctxes_.at(p);
-    auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(ctx);
+    auto *cuda_ctx = static_cast<phi::GPUContext *>(ctx);
     cuda_ctx->RecordEvent(events_.at(p.device), callback);
   }
 #else
...
@@ -69,7 +69,7 @@ struct TestReduceOpHandle {
     for (int i = 0; i < count; ++i) {
       auto p = p::CUDAPlace(i);
       gpu_list_.push_back(p);
-      ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      ctxs_.emplace_back(new p::phi::GPUContext(p));
     }
     nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
 #else
...
@@ -77,7 +77,7 @@ struct ScaleLossGradFunctor {
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       OutT cast_coeff = static_cast<OutT>(coeff_);
-      auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
+      auto stream = static_cast<phi::GPUContext *>(ctx_)->stream();
       memory::Copy(place_,
                    out_data,
                    platform::CPUPlace(),
...
@@ -151,7 +151,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
                              const int hidden_size,
                              const int expand_embed_dim,
                              const int64_t total_length) {
-  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
   auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
@@ -235,7 +235,7 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place,
                           const int64_t* gpu_len,
                           int slot_num,
                           int total_len) {
-  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
 #ifdef PADDLE_WITH_HIP
@@ -265,7 +265,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
                              const int expand_embed_dim,
                              const int64_t total_length,
                              const int batch_size) {
-  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
   auto slot_lengths_lod = slot_lengths;
...
@@ -223,10 +223,10 @@ class AfsManager {
     delete read_stream;
   }
   int PopenBidirectionalInternal(const char* command,
                                  FILE*& fp_read,   // NOLINT
-                                 FILE*& fp_write,
+                                 FILE*& fp_write,  // NOLINT
                                  pid_t& pid,       // NOLINT
                                  bool read,        // NOLINT
                                  bool write) {
     std::lock_guard<std::mutex> g(g_flock);
     int fd_read[2];
@@ -440,10 +440,9 @@ class BoxWrapper {
     std::vector<gpuStream_t*> stream_list;
     for (int i = 0; i < platform::GetGPUDeviceCount(); ++i) {
       VLOG(3) << "before get context i[" << i << "]";
-      platform::CUDADeviceContext* context =
-          dynamic_cast<platform::CUDADeviceContext*>(
-              platform::DeviceContextPool::Instance().Get(
-                  platform::CUDAPlace(i)));
+      phi::GPUContext* context = dynamic_cast<phi::GPUContext*>(
+          platform::DeviceContextPool::Instance().Get(
+              platform::CUDAPlace(i)));
       stream_list_[i] = context->stream();
       stream_list.push_back(&stream_list_[i]);
     }
...
@@ -300,7 +300,7 @@ void AccessorWrapper<GPUAccessor>::CopyForPullImpl(
     const int64_t total_length,
     int* gpu_dim,
     int feature_value_size) {
-  auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     paddle::platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
   auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
@@ -333,7 +333,7 @@ void AccessorWrapper<GPUAccessor>::CopyForPushImpl(
     size_t grad_value_size,
     std::vector<int>& slot_vector,
     std::vector<int>& slot_mf_dim_vector) {
-  auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     paddle::platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
   auto slot_lengths_lod = slot_lengths;
...
@@ -90,7 +90,7 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
                             const int64_t* gpu_len,
                             int slot_num,
                             int total_len) {
-  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
   CopyKeysKernel<<<(total_len + 1024 - 1) / 1024, 1024, 0, stream>>>(
...
@@ -78,14 +78,12 @@ DefaultStreamGarbageCollector::DefaultStreamGarbageCollector(
     : GarbageCollector(place, max_memory_size) {}
 void DefaultStreamGarbageCollector::Wait() const {
-  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
-      ->WaitStreamCallback();
+  static_cast<phi::GPUContext *>(this->dev_ctx_)->WaitStreamCallback();
 }
 void DefaultStreamGarbageCollector::ClearCallback(
     const std::function<void()> &callback) {
-  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
-      ->AddStreamCallback(callback);
+  static_cast<phi::GPUContext *>(this->dev_ctx_)->AddStreamCallback(callback);
 }
 StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
...
@@ -48,8 +48,7 @@ void SetMicroId(paddle::framework::Scope* scope,
     char* temp_ptr = temp.data();
     float* temp_ptr_float = reinterpret_cast<float*>(temp_ptr);
     temp_ptr_float[0] = micro_id;
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(*dev_ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(*dev_ctx).stream();
     memory::Copy(
         place,
         tensor_data,
...
@@ -514,7 +514,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
     }
   }
 #ifdef PADDLE_WITH_CUDA
-  auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
+  auto* dev_ctx = static_cast<phi::GPUContext*>(
       platform::DeviceContextPool::Instance().Get(place));
   PADDLE_ENFORCE_GPU_SUCCESS(
       cudaEventRecord(context->event_, dev_ctx->stream()));
...
@@ -229,7 +229,7 @@ void TestMainImpl(std::string func_name,
   device_code.SetWorkloadPerThread(1);
   device_code.Launch(n, &args);
-  auto* dev_ctx = reinterpret_cast<paddle::platform::CUDADeviceContext*>(
+  auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(
       paddle::platform::DeviceContextPool::Instance().Get(place));
   dev_ctx->Wait();
...
@@ -38,7 +38,7 @@ void CopyToCPUHelper(std::vector<T> *cpu_,
                      size_t *gpu_memory_size_) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // COPY GPU Data To CPU
-  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+  auto *dev_ctx = static_cast<phi::GPUContext *>(
       platform::DeviceContextPool::Instance().Get((*gpu_)->place()));
   auto stream = dev_ctx->stream();
   void *src = (*gpu_)->ptr();
@@ -63,7 +63,7 @@ void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
   *gpu_memory_size_ = cpu_->size() * sizeof(T);  // sizeof(T)
   (*gpu_) = memory::Alloc(place, *gpu_memory_size_);
   void *dst = (*gpu_)->ptr();
-  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+  auto *dev_ctx = static_cast<phi::GPUContext *>(
       platform::DeviceContextPool::Instance().Get(place));
   auto stream = dev_ctx->stream();
   paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(),
...
@@ -38,7 +38,7 @@ static __global__ void multiply_10(int* ptr) {
 }
 gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
-  return reinterpret_cast<const paddle::platform::CUDADeviceContext*>(
+  return reinterpret_cast<const phi::GPUContext*>(
              paddle::platform::DeviceContextPool::Instance().Get(place))
       ->stream();
 }
...
@@ -854,9 +854,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
   platform::RecordEvent record(
       "RecordStreamForGC", platform::TracerEventType::UserDefined, 10);
-  gpuStream_t stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                           instr.DeviceContext())
-                           .stream();
+  gpuStream_t stream =
+      reinterpret_cast<const phi::GPUContext&>(instr.DeviceContext()).stream();
   auto TensorRecordStream = [&stream](Tensor& tensor) {
     auto allocation = tensor.Holder();
     if (allocation == nullptr) {
...
@@ -236,9 +236,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,
                        paddle::framework::OpKernelTest<phi::CPUContext, float>);
 REGISTER_OP_CUDA_KERNEL(
-    op_with_kernel,
-    paddle::framework::OpKernelTest<paddle::platform::CUDADeviceContext,
-                                    float>);
+    op_with_kernel, paddle::framework::OpKernelTest<phi::GPUContext, float>);
 TEST(OperatorRegistrar, CPU) {
   paddle::framework::proto::OpDesc op_desc;
@@ -263,9 +261,9 @@ TEST(OperatorRegistrar, CUDA) {
 }
 static int op_test_value = 0;
-using paddle::platform::CUDADeviceContext;
 using paddle::platform::DeviceContext;
 using phi::CPUContext;
+using phi::GPUContext;
 namespace paddle {
 namespace framework {
@@ -301,7 +299,7 @@ class OpMultiKernelTest<CPUContext, T> : public paddle::framework::OpKernel<T> {
 };
 template <typename T>
-class OpMultiKernelTest<CUDADeviceContext, T>
+class OpMultiKernelTest<phi::GPUContext, T>
     : public paddle::framework::OpKernel<T> {
  public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const {
@@ -325,7 +323,7 @@ class OpMultiKernelTest2<CPUContext, T>
 };
 template <typename T>
-class OpMultiKernelTest2<CUDADeviceContext, T>
+class OpMultiKernelTest2<phi::GPUContext, T>
     : public paddle::framework::OpKernel<T> {
  public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const {
@@ -351,12 +349,12 @@ REGISTER_OP_KERNEL(
     op_with_multi_kernel,
     CUDA,
     paddle::platform::CUDAPlace,
-    paddle::framework::OpMultiKernelTest<CUDADeviceContext, float>);
+    paddle::framework::OpMultiKernelTest<phi::GPUContext, float>);
 REGISTER_OP_KERNEL(
     op_with_multi_kernel,
     CUDNN,
     paddle::platform::CUDAPlace,
-    paddle::framework::OpMultiKernelTest2<CUDADeviceContext, float>);
+    paddle::framework::OpMultiKernelTest2<phi::GPUContext, float>);
 TEST(OperatorRegistrar, OpWithMultiKernel) {
   paddle::framework::proto::OpDesc op_desc;
...
@@ -416,13 +416,12 @@ class ExecutionContext {
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  const inline platform::CUDADeviceContext& cuda_device_context() const {
+  const inline phi::GPUContext& cuda_device_context() const {
     PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()),
                       true,
                       platform::errors::PreconditionNotMet(
                           "Current device context place is not GPUPlace."));
-    return *reinterpret_cast<const platform::CUDADeviceContext*>(
-        &device_context_);
+    return *reinterpret_cast<const phi::GPUContext*>(&device_context_);
   }
 #endif
...
@@ -863,12 +863,12 @@ void ParallelExecutor::BCastParamsToDevices(
       nccl_ctxs->WaitAll();
     } else {
       auto src_place = member_->places_[0];
-      auto src_dev_ctx = static_cast<platform::CUDADeviceContext *>(
+      auto src_dev_ctx = static_cast<phi::GPUContext *>(
           platform::DeviceContextPool::Instance().Get(src_place));
       auto sizeof_dtype = framework::SizeOfType(dtype) * numel;
       for (size_t i = 1; i < member_->places_.size(); ++i) {
         auto dst_place = member_->places_[i];
-        auto dst_dev_ctx = static_cast<platform::CUDADeviceContext *>(
+        auto dst_dev_ctx = static_cast<phi::GPUContext *>(
             platform::DeviceContextPool::Instance().Get(dst_place));
         src_dev_ctx->Wait();
         dst_dev_ctx->Wait();
@@ -1492,8 +1492,8 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) {
         global_scope, member_->places_);
     auto &pool = platform::DeviceContextPool::Instance();
     for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) {
-      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
-          pool.Get(member_->places_[dev_id]));
+      auto *dev_ctx =
+          static_cast<phi::GPUContext *>(pool.Get(member_->places_[dev_id]));
       auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]);
       dev_ctx->set_nccl_comm(nccl_ctx.comm());
     }
...
@@ -72,7 +72,7 @@ struct ConvertToPhiContext<phi::CPUContext> {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <>
-struct ConvertToPhiContext<platform::CUDADeviceContext> {
+struct ConvertToPhiContext<phi::GPUContext> {
   using TYPE = phi::GPUContext;
 };
 #endif
...
...@@ -261,8 +261,7 @@ void TensorCopyImpl(const TENSOR& src, ...@@ -261,8 +261,7 @@ void TensorCopyImpl(const TENSOR& src,
"place is %s, context place is %s.", "place is %s, context place is %s.",
src_gpu_place, src_gpu_place,
ctx_gpu_place)); ctx_gpu_place));
auto stream = auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} }
else if (platform::is_cpu_place(src_place) && // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT
...@@ -284,8 +283,7 @@ void TensorCopyImpl(const TENSOR& src, ...@@ -284,8 +283,7 @@ void TensorCopyImpl(const TENSOR& src,
"destination place is %s, context place is %s.", "destination place is %s, context place is %s.",
dst_gpu_place, dst_gpu_place,
ctx_gpu_place)); ctx_gpu_place));
auto stream = auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} }
else if (platform::is_gpu_place(src_place) && // NOLINT else if (platform::is_gpu_place(src_place) && // NOLINT
...@@ -308,8 +306,7 @@ void TensorCopyImpl(const TENSOR& src, ...@@ -308,8 +306,7 @@ void TensorCopyImpl(const TENSOR& src,
"device context GPU number is %d.", "device context GPU number is %d.",
src_gpu_place.device, src_gpu_place.device,
ctx_gpu_place.device)); ctx_gpu_place.device));
auto stream = auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy( memory::Copy(
dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} }
...@@ -333,8 +330,7 @@ void TensorCopyImpl(const TENSOR& src, ...@@ -333,8 +330,7 @@ void TensorCopyImpl(const TENSOR& src,
"device context GPU number is %d.", "device context GPU number is %d.",
dst_gpu_place.device, dst_gpu_place.device,
ctx_gpu_place.device)); ctx_gpu_place.device));
auto stream = auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy( memory::Copy(
dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
} }
...@@ -349,8 +345,7 @@ void TensorCopyImpl(const TENSOR& src, ...@@ -349,8 +345,7 @@ void TensorCopyImpl(const TENSOR& src,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"Context place error, excepted GPUPlace, but actually %s.", "Context place error, excepted GPUPlace, but actually %s.",
ctx_place)); ctx_place));
auto stream = auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
if (platform::is_same_place(src_place, dst_place)) { if (platform::is_same_place(src_place, dst_place)) {
memory::Copy( memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
...@@ -1076,8 +1071,7 @@ void TensorToStream(std::ostream& os, ...@@ -1076,8 +1071,7 @@ void TensorToStream(std::ostream& os,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]); std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& gpu_dev_ctx = auto& gpu_dev_ctx = static_cast<const phi::GPUContext&>(dev_ctx);
static_cast<const platform::CUDADeviceContext&>(dev_ctx);
platform::CPUPlace cpu; platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr); uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) { while (size != 0) {
...@@ -1482,13 +1476,12 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) { ...@@ -1482,13 +1476,12 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
platform::CUDAPlace(dl_tensor.device.device_id); platform::CUDAPlace(dl_tensor.device.device_id);
dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place); auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place);
memory::Copy( memory::Copy(dst_place,
dst_place, dst_ptr,
dst_ptr, src_place,
src_place, src_ptr,
src_ptr, size,
size, reinterpret_cast<const phi::GPUContext&>(*ctx).stream());
reinterpret_cast<const platform::CUDADeviceContext&>(*ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
......
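Every hunk above reduces to the same substitution: the device context fetched from the pool is cast to phi::GPUContext (instead of platform::CUDADeviceContext) before its compute stream is handed to memory::Copy. A minimal sketch of the resulting pattern, assuming a CUDA/HIP build and the paddle::platform / paddle::memory namespaces already used by the surrounding file (CopyCpuToGpuAsync is an illustrative name, not part of the patch):

// Sketch only: async host-to-device copy on the context's compute stream.
void CopyCpuToGpuAsync(const platform::CUDAPlace& dst_place, void* dst_ptr,
                       const platform::CPUPlace& src_place, const void* src_ptr,
                       size_t size) {
  auto* dev_ctx = static_cast<phi::GPUContext*>(
      platform::DeviceContextPool::Instance().Get(dst_place));
  // phi::GPUContext keeps the stream() accessor the old CUDADeviceContext had,
  // so only the cast changes at the call sites above.
  memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, dev_ctx->stream());
}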
...@@ -164,13 +164,12 @@ void TensorFromArray(const T* src, ...@@ -164,13 +164,12 @@ void TensorFromArray(const T* src,
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(dst_place,
dst_place, dst_ptr,
dst_ptr, src_place,
src_place, src_ptr,
src_ptr, size,
size, reinterpret_cast<const phi::GPUContext&>(ctx).stream());
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
...@@ -242,13 +241,12 @@ void TensorFromVector(const std::vector<T>& src, ...@@ -242,13 +241,12 @@ void TensorFromVector(const std::vector<T>& src,
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(dst_place,
dst_place, dst_ptr,
dst_ptr, src_place,
src_place, src_ptr,
src_ptr, size,
size, reinterpret_cast<const phi::GPUContext&>(ctx).stream());
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
...@@ -340,13 +338,12 @@ inline void TensorFromVector(const std::vector<bool>& src, ...@@ -340,13 +338,12 @@ inline void TensorFromVector(const std::vector<bool>& src,
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(dst_place)) { // NOLINT else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(dst_place,
dst_place, dst_ptr,
dst_ptr, src_place,
src_place, src_ptr,
src_ptr, size,
size, reinterpret_cast<const phi::GPUContext&>(ctx).stream());
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
...@@ -444,13 +441,12 @@ void TensorToVector(const Tensor& src, ...@@ -444,13 +441,12 @@ void TensorToVector(const Tensor& src,
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(src.place())) { // NOLINT else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(dst_place,
dst_place, dst_ptr,
dst_ptr, src.place(),
src.place(), src_ptr,
src_ptr, size,
size, reinterpret_cast<const phi::GPUContext&>(ctx).stream());
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
#if defined(PADDLE_WITH_XPU) #if defined(PADDLE_WITH_XPU)
...@@ -503,13 +499,12 @@ inline void TensorToVector(const Tensor& src, ...@@ -503,13 +499,12 @@ inline void TensorToVector(const Tensor& src,
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(src.place())) { // NOLINT else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(dst_place,
dst_place, dst_ptr,
dst_ptr, src.place(),
src.place(), src_ptr,
src_ptr, size,
size, reinterpret_cast<const phi::GPUContext&>(ctx).stream());
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
#if defined(PADDLE_WITH_XPU) #if defined(PADDLE_WITH_XPU)
......
...@@ -73,7 +73,7 @@ TEST(TensorCopy, Tensor) { ...@@ -73,7 +73,7 @@ TEST(TensorCopy, Tensor) {
// CPU Tensor to GPU Tensor // CPU Tensor to GPU Tensor
auto gpu_place = new platform::CUDAPlace(0); auto gpu_place = new platform::CUDAPlace(0);
platform::CUDADeviceContext gpu_ctx(*gpu_place); phi::GPUContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream()) .GetAllocator(*gpu_place, gpu_ctx.stream())
.get()); .get());
...@@ -170,7 +170,7 @@ TEST(TensorFromVector, Tensor) { ...@@ -170,7 +170,7 @@ TEST(TensorFromVector, Tensor) {
// Copy to GPUTensor // Copy to GPUTensor
gpu_tensor.Resize(phi::make_ddim({3, 3})); gpu_tensor.Resize(phi::make_ddim({3, 3}));
auto gpu_place = new paddle::platform::CUDAPlace(); auto gpu_place = new paddle::platform::CUDAPlace();
paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place); phi::GPUContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream()) .GetAllocator(*gpu_place, gpu_ctx.stream())
.get()); .get());
...@@ -238,7 +238,7 @@ TEST(TensorToVector, Tensor) { ...@@ -238,7 +238,7 @@ TEST(TensorToVector, Tensor) {
std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
paddle::framework::Tensor gpu_tensor; paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place; paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place); phi::GPUContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream()) .GetAllocator(place, gpu_ctx.stream())
.get()); .get());
...@@ -255,22 +255,20 @@ TEST(TensorToVector, Tensor) { ...@@ -255,22 +255,20 @@ TEST(TensorToVector, Tensor) {
#endif #endif
} }
TEST(TensorToVector, Tensor_bool) { TEST(TensorToVector, Tensor_bool){{paddle::framework::Tensor src;
{ bool* src_ptr = src.mutable_data<bool>({3, 3}, paddle::platform::CPUPlace());
paddle::framework::Tensor src; for (int i = 0; i < 3 * 3; ++i) {
bool* src_ptr = src.mutable_data<bool>({3, 3}, paddle::platform::CPUPlace()); src_ptr[i] = static_cast<bool>(i % 2);
for (int i = 0; i < 3 * 3; ++i) { }
src_ptr[i] = static_cast<bool>(i % 2);
}
paddle::platform::CPUPlace place; paddle::platform::CPUPlace place;
std::vector<bool> dst; std::vector<bool> dst;
paddle::framework::TensorToVector<bool>(src, &dst); paddle::framework::TensorToVector<bool>(src, &dst);
for (int i = 0; i < 3 * 3; ++i) { for (int i = 0; i < 3 * 3; ++i) {
EXPECT_EQ(src_ptr[i], dst[i]); EXPECT_EQ(src_ptr[i], dst[i]);
}
} }
} // namespace framework
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
{ {
...@@ -287,7 +285,7 @@ TEST(TensorToVector, Tensor_bool) { ...@@ -287,7 +285,7 @@ TEST(TensorToVector, Tensor_bool) {
}; };
paddle::framework::Tensor gpu_tensor; paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place; paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place); phi::GPUContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream()) .GetAllocator(place, gpu_ctx.stream())
.get()); .get());
...@@ -328,7 +326,7 @@ TEST(TensorToVector, Tensor_bool) { ...@@ -328,7 +326,7 @@ TEST(TensorToVector, Tensor_bool) {
} }
} }
#endif #endif
} } // namespace paddle
TEST(TensorFromDLPack, Tensor) { TEST(TensorFromDLPack, Tensor) {
{ {
...@@ -525,7 +523,7 @@ TEST(Tensor, FromAndToStream) { ...@@ -525,7 +523,7 @@ TEST(Tensor, FromAndToStream) {
Tensor dst_tensor; Tensor dst_tensor;
auto gpu_place = new platform::CUDAPlace(); auto gpu_place = new platform::CUDAPlace();
platform::CUDADeviceContext gpu_ctx(*gpu_place); phi::GPUContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream()) .GetAllocator(*gpu_place, gpu_ctx.stream())
.get()); .get());
......
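Tests that used to construct platform::CUDADeviceContext now instantiate phi::GPUContext directly and must wire its allocator by hand before any kernel or copy runs, which is why every constructor call above is followed by SetAllocator. A condensed sketch of that setup, assuming a CUDA build (the full tests may bind further allocators, e.g. for host or pinned memory):

// Sketch only: building a usable phi::GPUContext inside a test.
paddle::platform::CUDAPlace place(0);
phi::GPUContext gpu_ctx(place);
// The unified context does not pull in an allocator by itself here, so bind
// the global facade allocator for this place and the context's own stream.
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                         .GetAllocator(place, gpu_ctx.stream())
                         .get());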
...@@ -95,7 +95,7 @@ static void AllReduce(const phi::SelectedRows &src, ...@@ -95,7 +95,7 @@ static void AllReduce(const phi::SelectedRows &src,
auto dtype = framework::TransToProtoVarType(src_tensor.dtype()); auto dtype = framework::TransToProtoVarType(src_tensor.dtype());
auto nccl_dtype = platform::ToNCCLDataType(dtype); auto nccl_dtype = platform::ToNCCLDataType(dtype);
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>( auto *dev_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place)); platform::DeviceContextPool::Instance().Get(place));
bool use_calc_stream = (dev_ctx->stream() == stream); bool use_calc_stream = (dev_ctx->stream() == stream);
...@@ -220,7 +220,7 @@ void AllReduce(const framework::Variable &src, ...@@ -220,7 +220,7 @@ void AllReduce(const framework::Variable &src,
int ring_id, int ring_id,
bool use_calc_stream) { bool use_calc_stream) {
const auto &place = GetVarPlace(src); const auto &place = GetVarPlace(src);
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>( auto *dev_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place)); platform::DeviceContextPool::Instance().Get(place));
platform::NCCLComm *comm = platform::NCCLComm *comm =
platform::NCCLCommContext::Instance().Get(ring_id, place); platform::NCCLCommContext::Instance().Get(ring_id, place);
......
...@@ -122,10 +122,9 @@ class TensorAddFunctor ...@@ -122,10 +122,9 @@ class TensorAddFunctor
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void operator()(const platform::CUDAPlace& place) const { void operator()(const platform::CUDAPlace& place) const {
platform::CUDADeviceContext* ctx = phi::GPUContext* ctx = dynamic_cast<phi::GPUContext*>(
dynamic_cast<platform::CUDADeviceContext*>( platform::DeviceContextPool::Instance().Get(place));
platform::DeviceContextPool::Instance().Get(place)); auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(*ctx);
auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_); blas.AXPY(numel_, 1., x_, y_);
} }
#else #else
...@@ -433,7 +432,7 @@ void TensorAdd(const VarType& src, VarType* dst) { ...@@ -433,7 +432,7 @@ void TensorAdd(const VarType& src, VarType* dst) {
if (data_type == framework::proto::VarType::FP16) { if (data_type == framework::proto::VarType::FP16) {
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return TensorAddImpl<platform::CUDADeviceContext, platform::float16>( return TensorAddImpl<phi::GPUContext, platform::float16>(
src_tensor, dst_tensor, place); src_tensor, dst_tensor, place);
#else #else
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
...@@ -450,7 +449,7 @@ void TensorAdd(const VarType& src, VarType* dst) { ...@@ -450,7 +449,7 @@ void TensorAdd(const VarType& src, VarType* dst) {
if (data_type == framework::proto::VarType::BF16) { if (data_type == framework::proto::VarType::BF16) {
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return TensorAddImpl<platform::CUDADeviceContext, platform::bfloat16>( return TensorAddImpl<phi::GPUContext, platform::bfloat16>(
src_tensor, dst_tensor, place); src_tensor, dst_tensor, place);
#else #else
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
...@@ -499,8 +498,8 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { ...@@ -499,8 +498,8 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (paddle::platform::is_gpu_place(place)) { if (paddle::platform::is_gpu_place(place)) {
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, float);
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, double); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, double);
} else { } else {
#endif #endif
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float);
...@@ -551,8 +550,8 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, ...@@ -551,8 +550,8 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, float);
PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, double); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, double);
} else { } else {
#endif #endif
PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float);
...@@ -614,8 +613,8 @@ std::shared_ptr<ReturnVarType> SelectedRowsMerge(const VarType& src1, ...@@ -614,8 +613,8 @@ std::shared_ptr<ReturnVarType> SelectedRowsMerge(const VarType& src1,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (paddle::platform::is_gpu_place(place)) { if (paddle::platform::is_gpu_place(place)) {
PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, float); PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, float);
PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, double); PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double);
} else { } else {
#endif #endif
PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, float);
......
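Inside the gradient accumulator the only functional change is the context type the BLAS handle is resolved against. A small sketch of the y += x accumulation it performs on GPU, assuming a CUDA/HIP build (TensorAddSketch is an illustrative helper, not part of the patch):

// Sketch only: accumulate x into y with AXPY through the unified context.
template <typename T>
void TensorAddSketch(const platform::CUDAPlace& place, int64_t numel,
                     const T* x, T* y) {
  auto* ctx = dynamic_cast<phi::GPUContext*>(
      platform::DeviceContextPool::Instance().Get(place));
  auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(*ctx);
  blas.AXPY(numel, static_cast<T>(1.0), x, y);  // y = 1 * x + y on ctx's stream
}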
...@@ -85,7 +85,7 @@ void NCCLParallelContext::Init() { ...@@ -85,7 +85,7 @@ void NCCLParallelContext::Init() {
VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id
<< " ring id: " << ring_id; << " ring id: " << ring_id;
// it will assign nccl_comm in CUDADeviceContext within ring_id // it will assign nccl_comm in phi::GPUContext within ring_id
platform::NCCLCommContext::Instance().CreateComm(&nccl_ids[ring_id], platform::NCCLCommContext::Instance().CreateComm(&nccl_ids[ring_id],
strategy_.nranks_, strategy_.nranks_,
strategy_.local_rank_, strategy_.local_rank_,
...@@ -119,7 +119,7 @@ void NCCLParallelContext::InitWithRingID(int ring_id) { ...@@ -119,7 +119,7 @@ void NCCLParallelContext::InitWithRingID(int ring_id) {
VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id
<< " ring id: " << ring_id; << " ring id: " << ring_id;
// it will assign nccl_comm in CUDADeviceContext within ring_id // it will assign nccl_comm in phi::GPUContext within ring_id
platform::NCCLCommContext::Instance().CreateComm( platform::NCCLCommContext::Instance().CreateComm(
&nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id); &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id);
...@@ -177,7 +177,7 @@ void NCCLParallelContext::WaitCompute(int ring_id) { ...@@ -177,7 +177,7 @@ void NCCLParallelContext::WaitCompute(int ring_id) {
ring_id, ring_id,
compute_events_.size())); compute_events_.size()));
auto compute_stream = static_cast<platform::CUDADeviceContext *>( auto compute_stream = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place_)) platform::DeviceContextPool::Instance().Get(place_))
->stream(); ->stream();
auto comm_stream = auto comm_stream =
...@@ -207,7 +207,7 @@ void NCCLParallelContext::WaitComm(int ring_id) { ...@@ -207,7 +207,7 @@ void NCCLParallelContext::WaitComm(int ring_id) {
ring_id, ring_id,
comm_events_.size())); comm_events_.size()));
auto compute_stream = static_cast<platform::CUDADeviceContext *>( auto compute_stream = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place_)) platform::DeviceContextPool::Instance().Get(place_))
->stream(); ->stream();
auto comm_stream = auto comm_stream =
...@@ -225,7 +225,7 @@ void NCCLParallelContext::WaitComm(int ring_id) { ...@@ -225,7 +225,7 @@ void NCCLParallelContext::WaitComm(int ring_id) {
} }
void NCCLParallelContext::SynchronizeCompute() { void NCCLParallelContext::SynchronizeCompute() {
auto *compute_dev_ctx = static_cast<platform::CUDADeviceContext *>( auto *compute_dev_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place_)); platform::DeviceContextPool::Instance().Get(place_));
compute_dev_ctx->Wait(); compute_dev_ctx->Wait();
} }
......
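WaitCompute and WaitComm keep their event-based handshake; only the cast used to reach the compute stream moves to phi::GPUContext. A rough CUDA-only sketch of that handshake (compute_event and comm_stream are assumed to be the pre-created members from the surrounding file; the HIP build uses the hip* equivalents):

// Sketch only: make the NCCL comm stream wait, on the GPU, for all work
// already queued on the unified context's compute stream.
void WaitComputeSketch(const paddle::platform::Place& place,
                       cudaEvent_t compute_event, cudaStream_t comm_stream) {
  auto compute_stream = static_cast<phi::GPUContext*>(
                            paddle::platform::DeviceContextPool::Instance()
                                .Get(place))
                            ->stream();
  cudaEventRecord(compute_event, compute_stream);      // mark compute progress
  cudaStreamWaitEvent(comm_stream, compute_event, 0);  // comm waits device-side
}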
...@@ -283,11 +283,10 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { ...@@ -283,11 +283,10 @@ void Group::ConcatTensors(const platform::DeviceContext &context) {
auto place = context.GetPlace(); auto place = context.GetPlace();
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
ConcatTensorsWithType( ConcatTensorsWithType(static_cast<const phi::GPUContext &>(context),
static_cast<const platform::CUDADeviceContext &>(context), dense_tensors_,
dense_tensors_, &dense_contents_,
&dense_contents_, dtype_);
dtype_);
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't concat grad tensors since it's not compiled with NCCL," "Paddle can't concat grad tensors since it's not compiled with NCCL,"
...@@ -344,11 +343,10 @@ void Group::SplitTensors(const platform::DeviceContext &context) { ...@@ -344,11 +343,10 @@ void Group::SplitTensors(const platform::DeviceContext &context) {
auto place = context.GetPlace(); auto place = context.GetPlace();
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
SplitTensorsWithType( SplitTensorsWithType(static_cast<const phi::GPUContext &>(context),
static_cast<const platform::CUDADeviceContext &>(context), &dense_contents_,
&dense_contents_, &dense_tensors_,
&dense_tensors_, dtype_);
dtype_);
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't split grad tensor since it's not compiled with NCCL," "Paddle can't split grad tensor since it's not compiled with NCCL,"
......
...@@ -27,13 +27,10 @@ void Group::DivNRanks(framework::Tensor *tensor, ...@@ -27,13 +27,10 @@ void Group::DivNRanks(framework::Tensor *tensor,
"Unsupport BF16 in DataParallel for now")); "Unsupport BF16 in DataParallel for now"));
} }
framework::VisitDataTypeForHIP( framework::VisitDataTypeForHIP(
dtype_, dtype_, DivNRanksForAllReduce<phi::GPUContext>(tensor, nranks, context));
DivNRanksForAllReduce<platform::CUDADeviceContext>(
tensor, nranks, context));
#else #else
framework::VisitDataType(dtype_, framework::VisitDataType(
DivNRanksForAllReduce<platform::CUDADeviceContext>( dtype_, DivNRanksForAllReduce<phi::GPUContext>(tensor, nranks, context));
tensor, nranks, context));
#endif #endif
} }
#endif #endif
......
...@@ -39,7 +39,7 @@ imperative::ParallelStrategy GetStrategy(int local_rank) { ...@@ -39,7 +39,7 @@ imperative::ParallelStrategy GetStrategy(int local_rank) {
void AllReduceByStream(int local_rank, int device_id) { void AllReduceByStream(int local_rank, int device_id) {
int data_size = 32; int data_size = 32;
const auto& place = platform::CUDAPlace(device_id); const auto& place = platform::CUDAPlace(device_id);
platform::CUDADeviceContext ctx(place); phi::GPUContext ctx(place);
// heter_parallel_ctx // heter_parallel_ctx
imperative::HeterParallelContext hpc(GetStrategy(local_rank), device_id); imperative::HeterParallelContext hpc(GetStrategy(local_rank), device_id);
......
...@@ -78,7 +78,7 @@ void Broadcast(int local_rank, int device_id) { ...@@ -78,7 +78,7 @@ void Broadcast(int local_rank, int device_id) {
int data_size = 4; int data_size = 4;
float test_data = 7; float test_data = 7;
const auto& place = platform::CUDAPlace(device_id); const auto& place = platform::CUDAPlace(device_id);
platform::CUDADeviceContext ctx(place); phi::GPUContext ctx(place);
imperative::NCCLParallelContext npc(GetStrategy(local_rank), place); imperative::NCCLParallelContext npc(GetStrategy(local_rank), place);
......
...@@ -194,8 +194,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, ...@@ -194,8 +194,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt,
"Only one choice can be made between CPU and XPU.")); "Only one choice can be made between CPU and XPU."));
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx = auto *dev_ctx = static_cast<const phi::GPUContext *>(pool.Get(place));
static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
auto dst_gpu_place = place; auto dst_gpu_place = place;
memory::Copy(dst_gpu_place, memory::Copy(dst_gpu_place,
static_cast<void *>(input_ptr), static_cast<void *>(input_ptr),
...@@ -283,7 +282,7 @@ bool AnalysisPredictor::Init( ...@@ -283,7 +282,7 @@ bool AnalysisPredictor::Init(
// NOTE: If the external_stream equals to global_device_contexts's stream, // NOTE: If the external_stream equals to global_device_contexts's stream,
// then fallback. // then fallback.
auto global_stream = auto global_stream =
static_cast<platform::CUDADeviceContext *>( static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place_)) platform::DeviceContextPool::Instance().Get(place_))
->stream(); ->stream();
if (predictor_stream_ != global_stream) { if (predictor_stream_ != global_stream) {
...@@ -1658,8 +1657,7 @@ void AnalysisPredictor::CollectShapeRangeInfo() { ...@@ -1658,8 +1657,7 @@ void AnalysisPredictor::CollectShapeRangeInfo() {
paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance(); paddle::platform::DeviceContextPool::Instance();
auto gpu_place = place_; auto gpu_place = place_;
auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>( auto *dev_ctx = static_cast<const phi::GPUContext *>(pool.Get(gpu_place));
pool.Get(gpu_place));
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
hipStreamSynchronize(dev_ctx->stream()); hipStreamSynchronize(dev_ctx->stream());
#else #else
...@@ -2331,8 +2329,7 @@ void InternalUtils::SyncStream(paddle_infer::Predictor *p) { ...@@ -2331,8 +2329,7 @@ void InternalUtils::SyncStream(paddle_infer::Predictor *p) {
auto *pred = dynamic_cast<paddle::AnalysisPredictor *>(p->predictor_.get()); auto *pred = dynamic_cast<paddle::AnalysisPredictor *>(p->predictor_.get());
paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance(); paddle::platform::DeviceContextPool::Instance();
auto *dev_ctx = reinterpret_cast<paddle::platform::CUDADeviceContext *>( auto *dev_ctx = reinterpret_cast<phi::GPUContext *>(pool.Get(pred->place_));
pool.Get(pred->place_));
cudaStreamSynchronize(dev_ctx->stream()); cudaStreamSynchronize(dev_ctx->stream());
#endif #endif
} }
......
...@@ -248,8 +248,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -248,8 +248,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::DeviceContextPool &pool = platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
auto *dev_ctx = auto *dev_ctx = static_cast<const phi::GPUContext *>(pool.Get(place_));
static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
auto dst_gpu_place = place_; auto dst_gpu_place = place_;
memory::Copy(dst_gpu_place, memory::Copy(dst_gpu_place,
static_cast<void *>(input_ptr), static_cast<void *>(input_ptr),
......
...@@ -158,8 +158,7 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst, ...@@ -158,8 +158,7 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst,
paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance(); paddle::platform::DeviceContextPool::Instance();
paddle::platform::CUDAPlace gpu_place(dst.device_); paddle::platform::CUDAPlace gpu_place(dst.device_);
auto* dev_ctx = static_cast<const paddle::platform::CUDADeviceContext*>( auto* dev_ctx = static_cast<const phi::GPUContext*>(pool.Get(gpu_place));
pool.Get(gpu_place));
if (src.place() == PlaceType::kCPU) { if (src.place() == PlaceType::kCPU) {
paddle::memory::Copy(gpu_place, paddle::memory::Copy(gpu_place,
......
...@@ -139,13 +139,12 @@ void MemoryCopyAsync(const platform::Place& dst_place, ...@@ -139,13 +139,12 @@ void MemoryCopyAsync(const platform::Place& dst_place,
} else if (platform::is_gpu_place(dst_place) && } else if (platform::is_gpu_place(dst_place) &&
platform::is_gpu_place(src_place)) { platform::is_gpu_place(src_place)) {
auto gpu_place = src_place; auto gpu_place = src_place;
memory::Copy( memory::Copy(gpu_place,
gpu_place, dst_data,
dst_data, gpu_place,
gpu_place, src_data,
src_data, size,
size, static_cast<const phi::GPUContext&>(ctx).stream());
static_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
......
...@@ -74,7 +74,7 @@ void make_fake_model(std::string* model, std::string* param) { ...@@ -74,7 +74,7 @@ void make_fake_model(std::string* model, std::string* param) {
framework::Scope scope; framework::Scope scope;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace place; platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place); phi::GPUContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream()) .GetAllocator(place, ctx.stream())
.get()); .get());
......
...@@ -118,8 +118,7 @@ void test_tensor_copy(const platform::DeviceContext& ctx) { ...@@ -118,8 +118,7 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx); TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
platform::GpuStreamSync( platform::GpuStreamSync(static_cast<const phi::GPUContext&>(ctx).stream());
static_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
std::vector<float> result; std::vector<float> result;
......
...@@ -68,7 +68,7 @@ TEST(EngineIOConverterTester, DefaultCPU) { ...@@ -68,7 +68,7 @@ TEST(EngineIOConverterTester, DefaultCPU) {
TEST(EngineIOConverterTester, DefaultGPU) { TEST(EngineIOConverterTester, DefaultGPU) {
platform::CUDAPlace place; platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place); phi::GPUContext ctx(place);
IOConverterTester(ctx); IOConverterTester(ctx);
} }
......
...@@ -124,7 +124,7 @@ class TRTConvertValidation { ...@@ -124,7 +124,7 @@ class TRTConvertValidation {
} }
void DeclVar(const std::string& name, const std::vector<int> dim_vec) { void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
platform::CUDADeviceContext ctx(place_); phi::GPUContext ctx(place_);
auto* x = scope_.Var(name); auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>(); auto* x_tensor = x->GetMutable<framework::LoDTensor>();
...@@ -172,7 +172,7 @@ class TRTConvertValidation { ...@@ -172,7 +172,7 @@ class TRTConvertValidation {
"But received batch_size:%d, max_batch_size_:%d", "But received batch_size:%d, max_batch_size_:%d",
batch_size, batch_size,
max_batch_size_)); max_batch_size_));
platform::CUDADeviceContext ctx(place_); phi::GPUContext ctx(place_);
op_->Run(scope_, place_); op_->Run(scope_, place_);
cudaStreamSynchronize(stream_); cudaStreamSynchronize(stream_);
std::vector<std::string> input_output_names; std::vector<std::string> input_output_names;
......
...@@ -347,11 +347,11 @@ int QkvToContextPluginDynamic::enqueue( ...@@ -347,11 +347,11 @@ int QkvToContextPluginDynamic::enqueue(
TransposeQKV( TransposeQKV(
batch, seq_len, head_size_, head_number_, input0_data, tptr, stream); batch, seq_len, head_size_, head_number_, input0_data, tptr, stream);
auto *device_ctx = static_cast<platform::CUDADeviceContext *>( auto *device_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get( platform::DeviceContextPool::Instance().Get(
platform::CUDAPlace(device_id))); platform::CUDAPlace(device_id)));
const platform::CUDADeviceContext &dev_ctx = *device_ctx; const phi::GPUContext &dev_ctx = *device_ctx;
operators::math::MultiHeadGPUComputeFunctor<float> multihead_compute_func; operators::math::MultiHeadGPUComputeFunctor<float> multihead_compute_func;
multihead_compute_func(dev_ctx, multihead_compute_func(dev_ctx,
batch, batch,
...@@ -403,7 +403,7 @@ int QkvToContextPluginDynamic::enqueue( ...@@ -403,7 +403,7 @@ int QkvToContextPluginDynamic::enqueue(
TransposeQKV( TransposeQKV(
batch, seq_len, head_size_, head_number_, input0_data, tptr, stream); batch, seq_len, head_size_, head_number_, input0_data, tptr, stream);
auto *device_ctx = static_cast<platform::CUDADeviceContext *>( auto *device_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get( platform::DeviceContextPool::Instance().Get(
platform::CUDAPlace(device_id))); platform::CUDAPlace(device_id)));
...@@ -414,7 +414,7 @@ int QkvToContextPluginDynamic::enqueue( ...@@ -414,7 +414,7 @@ int QkvToContextPluginDynamic::enqueue(
apply_scale<<<blocks, threads, 0, stream>>>( apply_scale<<<blocks, threads, 0, stream>>>(
tptr, static_cast<half>(scale_), n_q); tptr, static_cast<half>(scale_), n_q);
const platform::CUDADeviceContext &dev_ctx = *device_ctx; const phi::GPUContext &dev_ctx = *device_ctx;
operators::math::MultiHeadGPUComputeFunctor<half> multihead_compute_func; operators::math::MultiHeadGPUComputeFunctor<half> multihead_compute_func;
multihead_compute_func(dev_ctx, multihead_compute_func(dev_ctx,
batch, batch,
......
...@@ -34,7 +34,7 @@ namespace tensorrt { ...@@ -34,7 +34,7 @@ namespace tensorrt {
class TensorRTDynamicEngineTest : public ::testing::Test { class TensorRTDynamicEngineTest : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); ctx_ = new phi::GPUContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get()); .get());
...@@ -94,7 +94,7 @@ class TensorRTDynamicEngineTest : public ::testing::Test { ...@@ -94,7 +94,7 @@ class TensorRTDynamicEngineTest : public ::testing::Test {
framework::Tensor input_; framework::Tensor input_;
framework::Tensor output_; framework::Tensor output_;
TensorRTEngine *engine_; TensorRTEngine *engine_;
platform::CUDADeviceContext *ctx_; phi::GPUContext *ctx_;
}; };
TEST_F(TensorRTDynamicEngineTest, test_spmm) { TEST_F(TensorRTDynamicEngineTest, test_spmm) {
...@@ -199,7 +199,7 @@ TEST_F(TensorRTDynamicEngineTest, test_spmm) { ...@@ -199,7 +199,7 @@ TEST_F(TensorRTDynamicEngineTest, test_spmm) {
class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); ctx_ = new phi::GPUContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get()); .get());
...@@ -279,7 +279,7 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { ...@@ -279,7 +279,7 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test {
std::vector<framework::Tensor> inputs_; std::vector<framework::Tensor> inputs_;
std::vector<framework::Tensor> outputs_; std::vector<framework::Tensor> outputs_;
TensorRTEngine *engine_; TensorRTEngine *engine_;
platform::CUDADeviceContext *ctx_; phi::GPUContext *ctx_;
}; };
TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) { TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) {
......
...@@ -26,7 +26,7 @@ namespace tensorrt { ...@@ -26,7 +26,7 @@ namespace tensorrt {
class TensorRTEngineTest : public ::testing::Test { class TensorRTEngineTest : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); ctx_ = new phi::GPUContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get()); .get());
...@@ -69,7 +69,7 @@ class TensorRTEngineTest : public ::testing::Test { ...@@ -69,7 +69,7 @@ class TensorRTEngineTest : public ::testing::Test {
framework::Tensor input_; framework::Tensor input_;
framework::Tensor output_; framework::Tensor output_;
TensorRTEngine *engine_; TensorRTEngine *engine_;
platform::CUDADeviceContext *ctx_; phi::GPUContext *ctx_;
}; };
TEST_F(TensorRTEngineTest, add_layer) { TEST_F(TensorRTEngineTest, add_layer) {
......
...@@ -44,7 +44,7 @@ TEST(BestFitAllocator, concurrent_cuda) { ...@@ -44,7 +44,7 @@ TEST(BestFitAllocator, concurrent_cuda) {
std::unique_ptr<Allocator>(new BestFitAllocator(cuda_allocation.get()))); std::unique_ptr<Allocator>(new BestFitAllocator(cuda_allocation.get())));
platform::CUDAPlace gpu(0); platform::CUDAPlace gpu(0);
platform::CUDADeviceContext dev_ctx(gpu); phi::GPUContext dev_ctx(gpu);
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu, dev_ctx.stream()) .GetAllocator(gpu, dev_ctx.stream())
.get()); .get());
...@@ -64,8 +64,7 @@ TEST(BestFitAllocator, concurrent_cuda) { ...@@ -64,8 +64,7 @@ TEST(BestFitAllocator, concurrent_cuda) {
size_t* data = reinterpret_cast<size_t*>(allocation->ptr()); size_t* data = reinterpret_cast<size_t*>(allocation->ptr());
ForEachFill fill(data); ForEachFill fill(data);
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, platform::ForRange<phi::GPUContext> for_range(dev_ctx, allocate_size);
allocate_size);
for_range(fill); for_range(fill);
memory::Copy(platform::CPUPlace(), memory::Copy(platform::CPUPlace(),
......
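The allocator test keeps its ForRange fill; the template argument is simply the unified context now. A sketch of that fill pattern for a .cu translation unit, assuming a CUDA build (FillIndex is an illustrative functor, not part of the patch):

// Sketch only: run an element-wise functor over GPU memory with ForRange.
struct FillIndex {
  explicit FillIndex(size_t* data) : data_(data) {}
  HOSTDEVICE void operator()(size_t i) const { data_[i] = i; }  // write index i
  size_t* data_;
};

void FillOnGpu(const phi::GPUContext& dev_ctx, size_t* data, size_t n) {
  paddle::platform::ForRange<phi::GPUContext> for_range(dev_ctx, n);
  for_range(FillIndex(data));  // one GPU thread per element on dev_ctx's stream
}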
...@@ -29,53 +29,51 @@ namespace memory { ...@@ -29,53 +29,51 @@ namespace memory {
namespace allocation { namespace allocation {
/** /**
* CUDADeviceContextAllocation is a wrapper of the underlying allocation. * GPUContextAllocation is a wrapper of the underlying allocation.
* CUDADeviceContextAllocation adds a CUDA stream callback for the underlying * GPUContextAllocation adds a CUDA stream callback for the underlying
* allocation so that CUDADeviceContextAllocation can be used on a CUDA stream * allocation so that GPUContextAllocation can be used on a CUDA stream
* that deletes the allocation in the callback. * that deletes the allocation in the callback.
*/ */
class CUDADeviceContextAllocation : public Allocation { class GPUContextAllocation : public Allocation {
public: public:
explicit CUDADeviceContextAllocation(DecoratedAllocationPtr allocation) explicit GPUContextAllocation(DecoratedAllocationPtr allocation)
: Allocation(allocation->ptr(), : Allocation(allocation->ptr(),
allocation->base_ptr(), allocation->base_ptr(),
allocation->size(), allocation->size(),
allocation->place()), allocation->place()),
underlying_allocation_(std::move(allocation)) {} underlying_allocation_(std::move(allocation)) {}
~CUDADeviceContextAllocation() { ~GPUContextAllocation() {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
dev_ctx_, dev_ctx_,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"Device context is not set for CUDADeviceContextAllocation")); "Device context is not set for GPUContextAllocation"));
auto *p_allocation = underlying_allocation_.release(); auto *p_allocation = underlying_allocation_.release();
VLOG(4) << "Adding callback to delete CUDADeviceContextAllocation at " VLOG(4) << "Adding callback to delete GPUContextAllocation at "
<< p_allocation; << p_allocation;
dev_ctx_->AddStreamCallback([p_allocation] { dev_ctx_->AddStreamCallback([p_allocation] {
VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation; VLOG(4) << "Delete GPUContextAllocation at " << p_allocation;
Allocator::AllocationDeleter(p_allocation); Allocator::AllocationDeleter(p_allocation);
}); });
} }
void SetCUDADeviceContext(const platform::CUDADeviceContext *dev_ctx) { void SetGPUContext(const phi::GPUContext *dev_ctx) { dev_ctx_ = dev_ctx; }
dev_ctx_ = dev_ctx;
}
private: private:
DecoratedAllocationPtr underlying_allocation_; DecoratedAllocationPtr underlying_allocation_;
const platform::CUDADeviceContext *dev_ctx_{nullptr}; const phi::GPUContext *dev_ctx_{nullptr};
}; };
/** /**
* CUDADeviceContextAllocator will allocate a CUDADeviceContextAllocation * GPUContextAllocator will allocate a GPUContextAllocation
* after waiting for a self-created event on the default stream. It does so to * after waiting for a self-created event on the default stream. It does so to
* let the non-default stream allocate GPU memory which will be * let the non-default stream allocate GPU memory which will be
* released by a stream callback. * released by a stream callback.
*/ */
class CUDADeviceContextAllocator : public Allocator { class GPUContextAllocator : public Allocator {
public: public:
explicit CUDADeviceContextAllocator(platform::CUDAPlace place, explicit GPUContextAllocator(platform::CUDAPlace place,
gpuStream_t default_stream) gpuStream_t default_stream)
: place_(place), default_stream_(default_stream) { : place_(place), default_stream_(default_stream) {
platform::CUDADeviceGuard guard(place_.device); platform::CUDADeviceGuard guard(place_.device);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -87,7 +85,7 @@ class CUDADeviceContextAllocator : public Allocator { ...@@ -87,7 +85,7 @@ class CUDADeviceContextAllocator : public Allocator {
#endif #endif
} }
~CUDADeviceContextAllocator() { ~GPUContextAllocator() {
if (event_) { if (event_) {
platform::CUDADeviceGuard guard(place_.device); platform::CUDADeviceGuard guard(place_.device);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -103,9 +101,9 @@ class CUDADeviceContextAllocator : public Allocator { ...@@ -103,9 +101,9 @@ class CUDADeviceContextAllocator : public Allocator {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
default_stream_, default_stream_,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"Default stream is not set for CUDADeviceContextAllocator")); "Default stream is not set for GPUContextAllocator"));
platform::CUDADeviceGuard guard(place_.device); platform::CUDADeviceGuard guard(place_.device);
auto allocation = new CUDADeviceContextAllocation( auto allocation = new GPUContextAllocation(
static_unique_ptr_cast<Allocation>(memory::Alloc(place_, size))); static_unique_ptr_cast<Allocation>(memory::Alloc(place_, size)));
// Wait for the event on stream // Wait for the event on stream
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -127,20 +125,20 @@ class CUDADeviceContextAllocator : public Allocator { ...@@ -127,20 +125,20 @@ class CUDADeviceContextAllocator : public Allocator {
}; };
/** /**
* CUDADeviceContextAllocatorPool is a singleton that stores a mapping from * GPUContextAllocatorPool is a singleton that stores a mapping from
* CUDAPlace(s) to std::shared_ptr<CUDADeviceContextAllocator>. When a * CUDAPlace(s) to std::shared_ptr<GPUContextAllocator>. When a
* CUDADeviceContext's compute stream isn't the default stream, it can call this * phi::GPUContext's compute stream isn't the default stream, it can call this
* class to allocate GPU memory which will be released by a callback after * class to allocate GPU memory which will be released by a callback after
* stream execution. * stream execution.
*/ */
class CUDADeviceContextAllocatorPool { class GPUContextAllocatorPool {
public: public:
static CUDADeviceContextAllocatorPool &Instance() { static GPUContextAllocatorPool &Instance() {
static CUDADeviceContextAllocatorPool pool; static GPUContextAllocatorPool pool;
return pool; return pool;
} }
AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) { AllocationPtr Alloc(const phi::GPUContext &dev_ctx, size_t size) {
auto iter = auto iter =
allocators_.find(platform::CUDAPlace(dev_ctx.GetPlace().GetDeviceId())); allocators_.find(platform::CUDAPlace(dev_ctx.GetPlace().GetDeviceId()));
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
...@@ -149,25 +147,25 @@ class CUDADeviceContextAllocatorPool { ...@@ -149,25 +147,25 @@ class CUDADeviceContextAllocatorPool {
platform::errors::NotFound("No allocator found for CUDAPlace.")); platform::errors::NotFound("No allocator found for CUDAPlace."));
auto &allocator = iter->second; auto &allocator = iter->second;
AllocationPtr allocation = allocator->Allocate(size); AllocationPtr allocation = allocator->Allocate(size);
static_cast<CUDADeviceContextAllocation *>(allocation.get()) static_cast<GPUContextAllocation *>(allocation.get())
->SetCUDADeviceContext(&dev_ctx); ->SetGPUContext(&dev_ctx);
return allocation; return allocation;
} }
private: private:
CUDADeviceContextAllocatorPool() { GPUContextAllocatorPool() {
std::vector<int> devices = platform::GetSelectedDevices(); std::vector<int> devices = platform::GetSelectedDevices();
for (int i : devices) { for (int i : devices) {
auto place = platform::CUDAPlace(i); auto place = platform::CUDAPlace(i);
auto compute_stream = auto compute_stream =
platform::DeviceContextPool::Instance().GetByPlace(place)->stream(); platform::DeviceContextPool::Instance().GetByPlace(place)->stream();
auto allocator = std::shared_ptr<CUDADeviceContextAllocator>( auto allocator = std::shared_ptr<GPUContextAllocator>(
new CUDADeviceContextAllocator(place, compute_stream)); new GPUContextAllocator(place, compute_stream));
allocators_.insert(make_pair(place, allocator)); allocators_.insert(make_pair(place, allocator));
} }
} }
std::map<platform::CUDAPlace, std::shared_ptr<CUDADeviceContextAllocator>> std::map<platform::CUDAPlace, std::shared_ptr<GPUContextAllocator>>
allocators_; allocators_;
}; };
......
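Behaviour of the renamed allocator is unchanged: memory obtained through the pool is released by a stream callback on the owning context, so it outlives any work already queued on that context's stream. A hedged usage sketch (dev_ctx is assumed to be a phi::GPUContext whose compute stream is not the default stream, which is the case this allocator exists for):

// Sketch only: allocate scratch memory tied to a non-default-stream context.
paddle::memory::allocation::AllocationPtr scratch =
    paddle::memory::allocation::GPUContextAllocatorPool::Instance().Alloc(
        dev_ctx, /*size=*/1 << 20);
float* ptr = reinterpret_cast<float*>(scratch->ptr());
// ... enqueue kernels that use ptr on dev_ctx.stream() ...
// Destroying the AllocationPtr registers an AddStreamCallback on dev_ctx, so
// the underlying memory is freed only after the queued work has completed.
scratch.reset();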
...@@ -37,7 +37,7 @@ const int NUM_STREAMS = 8; ...@@ -37,7 +37,7 @@ const int NUM_STREAMS = 8;
const int N = 2; const int N = 2;
const float DELTA = 1e-1; const float DELTA = 1e-1;
using CudaDevCtxVec = std::vector<std::unique_ptr<platform::CUDADeviceContext>>; using CudaDevCtxVec = std::vector<std::unique_ptr<phi::GPUContext>>;
__global__ void kernel(float *x, int n) { __global__ void kernel(float *x, int n) {
int tid = threadIdx.x + blockIdx.x * blockDim.x; int tid = threadIdx.x + blockIdx.x * blockDim.x;
...@@ -65,7 +65,7 @@ void CheckKernelOutput(float *x, int n) { ...@@ -65,7 +65,7 @@ void CheckKernelOutput(float *x, int n) {
void MultiStreamCompute(float **data, void MultiStreamCompute(float **data,
float **second_data, float **second_data,
const platform::CUDADeviceContext &ctx) { const phi::GPUContext &ctx) {
// multi-streams // multi-streams
AllocationPtr allocation_ptr = Alloc(ctx, N * sizeof(float)); AllocationPtr allocation_ptr = Alloc(ctx, N * sizeof(float));
EXPECT_GE(allocation_ptr->size(), N * sizeof(float)); EXPECT_GE(allocation_ptr->size(), N * sizeof(float));
...@@ -88,7 +88,7 @@ void MultiStreamCompute(float **data, ...@@ -88,7 +88,7 @@ void MultiStreamCompute(float **data,
#endif #endif
} }
TEST(Malloc, CUDADeviceContextMultiStream) { TEST(Malloc, GPUContextMultiStream) {
auto place = platform::CUDAPlace(0); auto place = platform::CUDAPlace(0);
platform::SetDeviceId(0); platform::SetDeviceId(0);
...@@ -110,8 +110,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) { ...@@ -110,8 +110,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
main_stream_alloc_ptr.reset(); main_stream_alloc_ptr.reset();
for (int i = 0; i < NUM_STREAMS; ++i) { for (int i = 0; i < NUM_STREAMS; ++i) {
auto ctx = std::unique_ptr<platform::CUDADeviceContext>( auto ctx = std::unique_ptr<phi::GPUContext>(new phi::GPUContext(place));
new platform::CUDADeviceContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream()) .GetAllocator(place, ctx->stream())
.get()); .get());
...@@ -143,7 +142,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) { ...@@ -143,7 +142,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
} }
} }
TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { TEST(Malloc, GPUContextMultiThreadMultiStream) {
auto place = platform::CUDAPlace(0); auto place = platform::CUDAPlace(0);
platform::SetDeviceId(0); platform::SetDeviceId(0);
...@@ -166,8 +165,7 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { ...@@ -166,8 +165,7 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
main_stream_alloc_ptr.reset(); main_stream_alloc_ptr.reset();
for (int i = 0; i < NUM_STREAMS; ++i) { for (int i = 0; i < NUM_STREAMS; ++i) {
auto ctx = std::unique_ptr<platform::CUDADeviceContext>( auto ctx = std::unique_ptr<phi::GPUContext>(new phi::GPUContext(place));
new platform::CUDADeviceContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream()) .GetAllocator(place, ctx->stream())
.get()); .get());
......
...@@ -65,7 +65,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { ...@@ -65,7 +65,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
allocation_implicit_stream.reset(); allocation_implicit_stream.reset();
gpuStream_t default_stream = gpuStream_t default_stream =
dynamic_cast<platform::CUDADeviceContext *>( dynamic_cast<phi::GPUContext *>(
paddle::platform::DeviceContextPool::Instance().Get(place)) paddle::platform::DeviceContextPool::Instance().Get(place))
->stream(); ->stream();
allocation::AllocationPtr allocation_unique = allocation::AllocationPtr allocation_unique =
...@@ -143,7 +143,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) { ...@@ -143,7 +143,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
size_t alloc_size = 256; size_t alloc_size = 256;
gpuStream_t default_stream = gpuStream_t default_stream =
dynamic_cast<platform::CUDADeviceContext *>( dynamic_cast<phi::GPUContext *>(
paddle::platform::DeviceContextPool::Instance().Get(place)) paddle::platform::DeviceContextPool::Instance().Get(place))
->stream(); ->stream();
std::shared_ptr<Allocation> allocation_implicit_stream = std::shared_ptr<Allocation> allocation_implicit_stream =
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using framework::Tensor; using framework::Tensor;
using phi::GPUContext;
using platform::ActivationDescriptor; using platform::ActivationDescriptor;
using platform::CUDADeviceContext;
using platform::TensorDescriptor; using platform::TensorDescriptor;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -39,12 +39,12 @@ template <typename T> ...@@ -39,12 +39,12 @@ template <typename T>
struct CudnnActivationFunctor { struct CudnnActivationFunctor {
using ELEMENT_TYPE = T; using ELEMENT_TYPE = T;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
CudnnActivationFunctor(const CUDADeviceContext& ctx, CudnnActivationFunctor(const phi::GPUContext& ctx,
const T& c, const T& c,
const miopenActivationMode_t& m) const miopenActivationMode_t& m)
: ctx_(ctx), coef_(c), mode_(m) {} : ctx_(ctx), coef_(c), mode_(m) {}
#else #else
CudnnActivationFunctor(const CUDADeviceContext& ctx, CudnnActivationFunctor(const phi::GPUContext& ctx,
const T& c, const T& c,
const cudnnActivationMode_t& m) const cudnnActivationMode_t& m)
: ctx_(ctx), coef_(c), mode_(m) {} : ctx_(ctx), coef_(c), mode_(m) {}
...@@ -77,7 +77,7 @@ struct CudnnActivationFunctor { ...@@ -77,7 +77,7 @@ struct CudnnActivationFunctor {
out->mutable_data<T>(ctx_.GetPlace()))); out->mutable_data<T>(ctx_.GetPlace())));
#endif #endif
} }
const CUDADeviceContext& ctx_; const phi::GPUContext& ctx_;
const T coef_; const T coef_;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
const miopenActivationMode_t mode_; const miopenActivationMode_t mode_;
...@@ -90,12 +90,12 @@ template <typename T> ...@@ -90,12 +90,12 @@ template <typename T>
struct CudnnActivationGradFunctor { struct CudnnActivationGradFunctor {
using ELEMENT_TYPE = T; using ELEMENT_TYPE = T;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
CudnnActivationGradFunctor(const CUDADeviceContext& ctx, CudnnActivationGradFunctor(const phi::GPUContext& ctx,
const T& c, const T& c,
const miopenActivationMode_t& m) const miopenActivationMode_t& m)
: ctx_(ctx), coef_(c), mode_(m) {} : ctx_(ctx), coef_(c), mode_(m) {}
#else #else
CudnnActivationGradFunctor(const CUDADeviceContext& ctx, CudnnActivationGradFunctor(const phi::GPUContext& ctx,
const T& c, const T& c,
const cudnnActivationMode_t& m) const cudnnActivationMode_t& m)
: ctx_(ctx), coef_(c), mode_(m) {} : ctx_(ctx), coef_(c), mode_(m) {}
...@@ -141,7 +141,7 @@ struct CudnnActivationGradFunctor { ...@@ -141,7 +141,7 @@ struct CudnnActivationGradFunctor {
dx->mutable_data<T>(ctx_.GetPlace()))); dx->mutable_data<T>(ctx_.GetPlace())));
#endif #endif
} }
const CUDADeviceContext& ctx_; const phi::GPUContext& ctx_;
const T coef_; const T coef_;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
const miopenActivationMode_t mode_; const miopenActivationMode_t mode_;
...@@ -152,12 +152,12 @@ struct CudnnActivationGradFunctor { ...@@ -152,12 +152,12 @@ struct CudnnActivationGradFunctor {
template <typename T> template <typename T>
struct CudnnReluFunctor : public CudnnActivationFunctor<T> { struct CudnnReluFunctor : public CudnnActivationFunctor<T> {
explicit CudnnReluFunctor(const CUDADeviceContext& ctx) explicit CudnnReluFunctor(const phi::GPUContext& ctx)
: CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} : CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {}
}; };
template <typename T> template <typename T>
struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> { struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) explicit CudnnReluGradFunctor(const phi::GPUContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} : CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {}
static constexpr ActBwdOpFwdDeps FwdDeps() { static constexpr ActBwdOpFwdDeps FwdDeps() {
...@@ -167,12 +167,12 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> { ...@@ -167,12 +167,12 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> {
template <typename T> template <typename T>
struct CudnnRelu6Functor : public CudnnActivationFunctor<T> { struct CudnnRelu6Functor : public CudnnActivationFunctor<T> {
explicit CudnnRelu6Functor(const CUDADeviceContext& ctx) explicit CudnnRelu6Functor(const phi::GPUContext& ctx)
: CudnnActivationFunctor<T>(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} : CudnnActivationFunctor<T>(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {}
}; };
template <typename T> template <typename T>
struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> { struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnRelu6GradFunctor(const CUDADeviceContext& ctx) explicit CudnnRelu6GradFunctor(const phi::GPUContext& ctx)
: CudnnActivationGradFunctor<T>( : CudnnActivationGradFunctor<T>(
ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {}
...@@ -183,12 +183,12 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> { ...@@ -183,12 +183,12 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> {
template <typename T> template <typename T>
struct CudnnSigmoidFunctor : public CudnnActivationFunctor<T> { struct CudnnSigmoidFunctor : public CudnnActivationFunctor<T> {
explicit CudnnSigmoidFunctor(const CUDADeviceContext& ctx) explicit CudnnSigmoidFunctor(const phi::GPUContext& ctx)
: CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} : CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {}
}; };
template <typename T> template <typename T>
struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> { struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) explicit CudnnSigmoidGradFunctor(const phi::GPUContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} : CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {}
static constexpr ActBwdOpFwdDeps FwdDeps() { static constexpr ActBwdOpFwdDeps FwdDeps() {
...@@ -198,12 +198,12 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> { ...@@ -198,12 +198,12 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> {
template <typename T> template <typename T>
struct CudnnTanhFunctor : public CudnnActivationFunctor<T> { struct CudnnTanhFunctor : public CudnnActivationFunctor<T> {
explicit CudnnTanhFunctor(const CUDADeviceContext& ctx) explicit CudnnTanhFunctor(const phi::GPUContext& ctx)
: CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} : CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {}
}; };
template <typename T> template <typename T>
struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor<T> { struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) explicit CudnnTanhGradFunctor(const phi::GPUContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} : CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {}
static constexpr ActBwdOpFwdDeps FwdDeps() { static constexpr ActBwdOpFwdDeps FwdDeps() {
...@@ -221,7 +221,7 @@ class CudnnActivationKernel ...@@ -221,7 +221,7 @@ class CudnnActivationKernel
framework::Tensor* Out = nullptr; framework::Tensor* Out = nullptr;
ExtractActivationTensor(context, &X, &Out); ExtractActivationTensor(context, &X, &Out);
Out->mutable_data<T>(context.GetPlace()); Out->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<CUDADeviceContext>(); auto& dev_ctx = context.template device_context<phi::GPUContext>();
Functor functor(dev_ctx); Functor functor(dev_ctx);
functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivation"), Out); functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivation"), Out);
} }
...@@ -242,7 +242,7 @@ class CudnnActivationGradKernel ...@@ -242,7 +242,7 @@ class CudnnActivationGradKernel
ExtractActivationGradTensor<Functor::FwdDeps()>( ExtractActivationGradTensor<Functor::FwdDeps()>(
context, &X, &Out, &dOut, &dX); context, &X, &Out, &dOut, &dX);
dX->mutable_data<T>(context.GetPlace()); dX->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<CUDADeviceContext>(); auto& dev_ctx = context.template device_context<phi::GPUContext>();
Functor functor(dev_ctx); Functor functor(dev_ctx);
functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivationGrad"), functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivationGrad"),
GET_DATA_SAFELY(Out, "Input", "Out", "CudnnActivationGrad"), GET_DATA_SAFELY(Out, "Input", "Out", "CudnnActivationGrad"),
......
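The two kernels above capture the core of this change: operator kernels now ask the execution context for the unified phi::GPUContext instead of the removed platform::CUDADeviceContext. A minimal sketch of that access pattern follows; ExampleGpuKernel and Functor are placeholders for illustration, not names introduced by this PR, and the include path is the usual fluid op-registry header.

    #include "paddle/fluid/framework/op_registry.h"

    namespace paddle {
    namespace operators {

    // Sketch only: how a CUDA kernel obtains the unified device context.
    template <typename T, typename Functor>
    class ExampleGpuKernel : public framework::OpKernel<T> {
     public:
      void Compute(const framework::ExecutionContext& context) const override {
        // phi::GPUContext replaces the old platform::CUDADeviceContext alias.
        auto& dev_ctx = context.template device_context<phi::GPUContext>();
        Functor functor(dev_ctx);
        // ... extract tensors and invoke the functor, as CudnnActivationKernel does.
      }
    };

    }  // namespace operators
    }  // namespace paddle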
...@@ -194,87 +194,74 @@ using CudaELUGradNegativeAlphaFunctor = ...@@ -194,87 +194,74 @@ using CudaELUGradNegativeAlphaFunctor =
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
#define REGISTER_ACTIVATION_CUDA_KERNEL( \ #define REGISTER_ACTIVATION_CUDA_KERNEL( \
act_type, op_name, functor, grad_functor) \ act_type, op_name, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \ REGISTER_OP_CUDA_KERNEL( \
act_type, \ act_type, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<float>>, \
ops::functor<float>>, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<double>>, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<plat::float16>>, \
ops::functor<double>>, \ ops::ActivationCudaKernel<phi::GPUContext, \
ops::ActivationCudaKernel<plat::CUDADeviceContext, \ ops::functor<plat::bfloat16>>); \
ops::functor<plat::float16>>, \ REGISTER_OP_CUDA_KERNEL( \
ops::ActivationCudaKernel<plat::CUDADeviceContext, \ act_type##_grad, \
ops::functor<plat::bfloat16>>); \ ops::ActivationGradCudaKernel<phi::GPUContext, \
REGISTER_OP_CUDA_KERNEL( \ ops::grad_functor<float>>, \
act_type##_grad, \ ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \ ops::grad_functor<double>>, \
ops::grad_functor<float>>, \ ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \ ops::grad_functor<plat::float16>>, \
ops::grad_functor<double>>, \ ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::bfloat16>>); ops::grad_functor<plat::bfloat16>>);
#define REGISTER_ACTIVATION_CUDA_KERNEL_INT( \ #define REGISTER_ACTIVATION_CUDA_KERNEL_INT( \
act_type, op_name, functor, grad_functor) \ act_type, op_name, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \ REGISTER_OP_CUDA_KERNEL( \
act_type, \ act_type, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<float>>, \
ops::functor<float>>, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<double>>, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<int>>, \
ops::functor<double>>, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<int64_t>>, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<plat::float16>>, \
ops::functor<int>>, \ ops::ActivationCudaKernel<phi::GPUContext, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \ ops::functor<plat::bfloat16>>); \
ops::functor<int64_t>>, \ REGISTER_OP_CUDA_KERNEL( \
ops::ActivationCudaKernel<plat::CUDADeviceContext, \ act_type##_grad, \
ops::functor<plat::float16>>, \ ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::ActivationCudaKernel<plat::CUDADeviceContext, \ ops::grad_functor<float>>, \
ops::functor<plat::bfloat16>>); \ ops::ActivationGradCudaKernel<phi::GPUContext, \
REGISTER_OP_CUDA_KERNEL( \ ops::grad_functor<double>>, \
act_type##_grad, \ ops::ActivationGradCudaKernel<phi::GPUContext, ops::grad_functor<int>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \ ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::grad_functor<float>>, \ ops::grad_functor<int64_t>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \ ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::grad_functor<double>>, \ ops::grad_functor<plat::float16>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \ ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::grad_functor<int>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<int64_t>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::bfloat16>>); ops::grad_functor<plat::bfloat16>>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
relu6, relu6,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<float>>,
ops::CudaRelu6Functor<float>>, ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<double>>,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<int>>,
ops::CudaRelu6Functor<double>>, ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<int64_t>>,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, ops::ActivationCudaKernel<phi::GPUContext,
ops::CudaRelu6Functor<int>>,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
ops::CudaRelu6Functor<int64_t>>,
ops::ActivationCudaKernel<plat::CUDADeviceContext,
ops::CudaRelu6Functor<plat::float16>>, ops::CudaRelu6Functor<plat::float16>>,
ops::ActivationCudaKernel<plat::CUDADeviceContext, ops::ActivationCudaKernel<phi::GPUContext,
ops::CudaRelu6Functor<plat::bfloat16>>); ops::CudaRelu6Functor<plat::bfloat16>>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
relu6_grad, relu6_grad,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<float>>, ops::CudaRelu6GradFunctor<float>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<double>>, ops::CudaRelu6GradFunctor<double>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<int>>, ops::CudaRelu6GradFunctor<int>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<int64_t>>, ops::CudaRelu6GradFunctor<int64_t>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<plat::float16>>, ops::CudaRelu6GradFunctor<plat::float16>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<plat::bfloat16>>); ops::CudaRelu6GradFunctor<plat::bfloat16>>);
#define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \
......
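The macro rewrite above only swaps the device-context template argument; the expansion shape is otherwise unchanged. As a rough sketch, invoking the rewritten macro for a hypothetical activation foo (not an operator touched by this PR) would register kernels along these lines:

    // Hypothetical invocation; foo, FooFunctor and FooGradFunctor are placeholders.
    REGISTER_ACTIVATION_CUDA_KERNEL(foo, Foo, FooFunctor, FooGradFunctor);

    // ...which now instantiates every element type against phi::GPUContext, e.g.
    //   ops::ActivationCudaKernel<phi::GPUContext, ops::FooFunctor<float>>
    //   ops::ActivationGradCudaKernel<phi::GPUContext,
    //                                 ops::FooGradFunctor<plat::bfloat16>>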
...@@ -211,7 +211,7 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel<T> { ...@@ -211,7 +211,7 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext; using CUDA = phi::GPUContext;
REGISTER_OP_CUDA_KERNEL(affine_channel, REGISTER_OP_CUDA_KERNEL(affine_channel,
ops::AffineChannelCUDAKernel<CUDA, float>, ops::AffineChannelCUDAKernel<CUDA, float>,
......
...@@ -35,7 +35,7 @@ class CUDNNAffineGridOpKernel : public framework::OpKernel<T> { ...@@ -35,7 +35,7 @@ class CUDNNAffineGridOpKernel : public framework::OpKernel<T> {
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Only support for CUDAPlace.Please switch your context from " "Only support for CUDAPlace.Please switch your context from "
"CPUPlace to CUDAPlace or update your cudnn.")); "CPUPlace to CUDAPlace or update your cudnn."));
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
auto* theta = ctx.Input<Tensor>("Theta"); auto* theta = ctx.Input<Tensor>("Theta");
auto* output = ctx.Output<Tensor>("Output"); auto* output = ctx.Output<Tensor>("Output");
...@@ -83,7 +83,7 @@ class CUDNNAffineGridGradOpKernel : public framework::OpKernel<T> { ...@@ -83,7 +83,7 @@ class CUDNNAffineGridGradOpKernel : public framework::OpKernel<T> {
"support for CUDAPlace. Please switch " "support for CUDAPlace. Please switch "
"your context from CPUPlace to " "your context from CPUPlace to "
"CUDAPlace or update your cudnn.")); "CUDAPlace or update your cudnn."));
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output")); auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta")); auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
......
...@@ -29,7 +29,7 @@ __global__ void LinspaceKernel(T start, T step, int64_t size, T* out) { ...@@ -29,7 +29,7 @@ __global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
} }
template <typename T> template <typename T>
struct Linspace<paddle::platform::CUDADeviceContext, T> { struct Linspace<phi::GPUContext, T> {
void operator()(T start, void operator()(T start,
T end, T end,
int count, int count,
...@@ -191,7 +191,7 @@ class AffineGridGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -191,7 +191,7 @@ class AffineGridGradOpCUDAKernel : public framework::OpKernel<T> {
w = size_attr[3]; w = size_attr[3];
} }
T* theta_grad_data = theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace()); T* theta_grad_data = theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace());
phi::funcs::SetConstant<paddle::platform::CUDADeviceContext, T>()( phi::funcs::SetConstant<phi::GPUContext, T>()(
ctx.cuda_device_context(), theta_grad, static_cast<T>(0)); ctx.cuda_device_context(), theta_grad, static_cast<T>(0));
T h_step; T h_step;
......
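Math helpers follow the same renaming: functors such as Linspace and SetConstant are specialized or instantiated on phi::GPUContext. A minimal sketch of zero-filling a gradient tensor under the unified context, mirroring the AffineGridGradOpCUDAKernel hunk above (the function name and header path are assumptions for illustration):

    #include "paddle/phi/kernels/funcs/math_function.h"

    // Sketch: fill a GPU tensor with zeros through the unified context type.
    void ZeroFillOnGpu(const phi::GPUContext& dev_ctx, phi::DenseTensor* grad) {
      phi::funcs::SetConstant<phi::GPUContext, float>()(
          dev_ctx, grad, static_cast<float>(0));
    }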
...@@ -82,7 +82,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> { ...@@ -82,7 +82,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const { void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X"); const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale"); const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out"); auto outs = ctx.MultiOutput<framework::Tensor>("Out");
...@@ -92,8 +92,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> { ...@@ -92,8 +92,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace()); bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
framework::Tensor inverse_scale = framework::Tensor inverse_scale =
ctx.AllocateTmpTensor<MPDType, platform::CUDADeviceContext>({1}, ctx.AllocateTmpTensor<MPDType, phi::GPUContext>({1}, dev_ctx);
dev_ctx);
MPDType* inverse_scale_v = inverse_scale.template data<MPDType>(); MPDType* inverse_scale_v = inverse_scale.template data<MPDType>();
InverseAndMemset<MPDType><<<1, 1, 0, dev_ctx.stream()>>>( InverseAndMemset<MPDType><<<1, 1, 0, dev_ctx.stream()>>>(
......
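Temporary workspace allocation keys on the same context type: AllocateTmpTensor is now templated on phi::GPUContext. A short sketch of the pattern from the check_finite_and_unscale hunk above, restated as a helper (the helper name is illustrative; MPDType stands for the kernel's multi-precision type, as in the original):

    // Sketch: allocate a one-element temporary on the GPU via the execution
    // context, bound to the unified phi::GPUContext.
    template <typename MPDType>
    framework::Tensor AllocInverseScale(const framework::ExecutionContext& ctx,
                                        const phi::GPUContext& dev_ctx) {
      framework::Tensor inverse_scale =
          ctx.AllocateTmpTensor<MPDType, phi::GPUContext>({1}, dev_ctx);
      // Callers read/write it through inverse_scale.template data<MPDType>().
      return inverse_scale;
    }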
...@@ -87,11 +87,9 @@ __global__ void FusedFillIf(T** outs, ...@@ -87,11 +87,9 @@ __global__ void FusedFillIf(T** outs,
} }
template <typename T, bool IsFoundInfOnCPU> template <typename T, bool IsFoundInfOnCPU>
class UpdateLossScalingFunctor<platform::CUDADeviceContext, class UpdateLossScalingFunctor<phi::GPUContext, T, IsFoundInfOnCPU> {
T,
IsFoundInfOnCPU> {
public: public:
void operator()(const platform::CUDADeviceContext& dev_ctx, void operator()(const phi::GPUContext& dev_ctx,
const bool* found_inf_data, const bool* found_inf_data,
const T* pre_loss_scaling_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* good_in_data,
...@@ -134,9 +132,9 @@ class UpdateLossScalingFunctor<platform::CUDADeviceContext, ...@@ -134,9 +132,9 @@ class UpdateLossScalingFunctor<platform::CUDADeviceContext,
}; };
template <typename T> template <typename T>
class LazyZeros<platform::CUDADeviceContext, T> { class LazyZeros<phi::GPUContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& dev_ctx, void operator()(const phi::GPUContext& dev_ctx,
const bool* found_inf_data, const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs, const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const { const std::vector<framework::Tensor*>& outs) const {
...@@ -204,7 +202,7 @@ class LazyZeros<platform::CUDADeviceContext, T> { ...@@ -204,7 +202,7 @@ class LazyZeros<platform::CUDADeviceContext, T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
using GPU = paddle::platform::CUDADeviceContext; using GPU = phi::GPUContext;
REGISTER_OP_CUDA_KERNEL(update_loss_scaling, REGISTER_OP_CUDA_KERNEL(update_loss_scaling,
ops::UpdateLossScalingKernel<GPU, float>, ops::UpdateLossScalingKernel<GPU, float>,
......
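Device-specialized functor templates get the same treatment: their GPU specializations are now written against phi::GPUContext, and registration files keep a short alias for it. A rough sketch of the shape (FooFunctor is a placeholder, not a class from this PR):

    // Primary template, specialized per device context elsewhere.
    template <typename DeviceContext, typename T>
    class FooFunctor;

    // The GPU specialization now names phi::GPUContext directly.
    template <typename T>
    class FooFunctor<phi::GPUContext, T> {
     public:
      void operator()(const phi::GPUContext& dev_ctx, const T* in, T* out) const {
        // ... launch device code on dev_ctx.stream(), as the loss-scaling
        // functors above do.
      }
    };

    // Registration sites keep a terse alias, as in `using GPU = phi::GPUContext;`.
    using GPU = phi::GPUContext;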
...@@ -54,7 +54,7 @@ struct ArrayToLoDFunctor : public std::unary_function<platform::Place, void> { ...@@ -54,7 +54,7 @@ struct ArrayToLoDFunctor : public std::unary_function<platform::Place, void> {
Apply(static_cast<phi::CPUContext *>(pool.Get(place))); Apply(static_cast<phi::CPUContext *>(pool.Get(place)));
} else { } else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Apply(static_cast<platform::CUDADeviceContext *>(pool.Get(place))); Apply(static_cast<phi::GPUContext *>(pool.Get(place)));
#else #else
PADDLE_THROW( PADDLE_THROW(
platform::errors::Unavailable("Paddle is not compiled with CUDA.")); platform::errors::Unavailable("Paddle is not compiled with CUDA."));
......
...@@ -82,8 +82,7 @@ class AssignPosCUDAKernel : public framework::OpKernel<T> { ...@@ -82,8 +82,7 @@ class AssignPosCUDAKernel : public framework::OpKernel<T> {
*eff_num_len, platform::CPUPlace(), &cpu_eff_num_len); *eff_num_len, platform::CPUPlace(), &cpu_eff_num_len);
cpu_eff_num_len_data = cpu_eff_num_len.data<T>()[0]; cpu_eff_num_len_data = cpu_eff_num_len.data<T>()[0];
} }
const auto& dev_ctx = const auto& dev_ctx = context.template device_context<phi::GPUContext>();
context.template device_context<platform::CUDADeviceContext>();
framework::DDim out_dims = phi::make_ddim({cpu_eff_num_len_data}); framework::DDim out_dims = phi::make_ddim({cpu_eff_num_len_data});
auto out_data = out->mutable_data<T>(out_dims, place); auto out_data = out->mutable_data<T>(out_dims, place);
......
...@@ -114,9 +114,9 @@ class BatchFCCUDAKernel : public framework::OpKernel<T> { ...@@ -114,9 +114,9 @@ class BatchFCCUDAKernel : public framework::OpKernel<T> {
T* out_data = output->mutable_data<T>(ctx.GetPlace()); T* out_data = output->mutable_data<T>(ctx.GetPlace());
// initialize // initialize
auto out_eigen = framework::EigenVector<T>::Flatten(*output); auto out_eigen = framework::EigenVector<T>::Flatten(*output);
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto& place = *ctx.template device_context<platform::CUDADeviceContext>() auto& place =
.eigen_device(); *ctx.template device_context<phi::GPUContext>().eigen_device();
out_eigen.device(place) = out_eigen.constant(static_cast<T>(0)); out_eigen.device(place) = out_eigen.constant(static_cast<T>(0));
CBLAS_TRANSPOSE transA = CblasNoTrans; CBLAS_TRANSPOSE transA = CblasNoTrans;
...@@ -127,7 +127,7 @@ class BatchFCCUDAKernel : public framework::OpKernel<T> { ...@@ -127,7 +127,7 @@ class BatchFCCUDAKernel : public framework::OpKernel<T> {
int64_t strideA = ins_num * in_dim; int64_t strideA = ins_num * in_dim;
int64_t strideB = in_dim * out_dim; int64_t strideB = in_dim * out_dim;
auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx); auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
blas.BatchedGEMM(transA, blas.BatchedGEMM(transA,
transB, transB,
ins_num, ins_num,
...@@ -169,9 +169,9 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -169,9 +169,9 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> {
auto in_dim = input_dims[2]; auto in_dim = input_dims[2];
auto out_dim = w_dims[2]; auto out_dim = w_dims[2];
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto& place = *ctx.template device_context<platform::CUDADeviceContext>() auto& place =
.eigen_device(); *ctx.template device_context<phi::GPUContext>().eigen_device();
// initialize // initialize
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
auto dx_eigen = framework::EigenVector<T>::Flatten(*dx); auto dx_eigen = framework::EigenVector<T>::Flatten(*dx);
...@@ -199,7 +199,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -199,7 +199,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> {
out_dim, out_dim,
db_data); db_data);
auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx); auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
T alpha = 1; T alpha = 1;
T beta = 0; T beta = 0;
...@@ -238,7 +238,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -238,7 +238,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
using GPUCtx = paddle::platform::CUDADeviceContext; using GPUCtx = phi::GPUContext;
REGISTER_OP_CUDA_KERNEL(batch_fc, REGISTER_OP_CUDA_KERNEL(batch_fc,
ops::BatchFCCUDAKernel<GPUCtx, float>, ops::BatchFCCUDAKernel<GPUCtx, float>,
ops::BatchFCCUDAKernel<GPUCtx, double>); ops::BatchFCCUDAKernel<GPUCtx, double>);
......
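BLAS handles are fetched through the unified context as well. A minimal sketch of obtaining a BLAS object bound to phi::GPUContext and issuing a single GEMM, following the batch_fc hunks above (the function name, shapes, and header path are assumptions for illustration):

    #include "paddle/phi/kernels/funcs/blas/blas.h"

    // Sketch: C = A * B on the GPU through the unified context.
    void GemmSketch(const phi::GPUContext& dev_ctx,
                    const float* a, const float* b, float* c,
                    int m, int n, int k) {
      auto blas = phi::funcs::GetBlas<phi::GPUContext, float>(dev_ctx);
      blas.GEMM(CblasNoTrans, CblasNoTrans, m, n, k,
                /*alpha=*/1.0f, a, b, /*beta=*/0.0f, c);
    }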
...@@ -17,9 +17,8 @@ limitations under the License. */ ...@@ -17,9 +17,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(beam_search,
beam_search, ops::BeamSearchOpKernel<phi::GPUContext, float>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, float>, ops::BeamSearchOpKernel<phi::GPUContext, double>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, double>, ops::BeamSearchOpKernel<phi::GPUContext, int>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int>, ops::BeamSearchOpKernel<phi::GPUContext, int64_t>);
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
using CUDA = paddle::platform::CUDADeviceContext; using CUDA = phi::GPUContext;
// See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc // See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc
REGISTER_OP_CUDA_KERNEL(transfer_dtype, REGISTER_OP_CUDA_KERNEL(transfer_dtype,
ops::CastOpKernel<CUDA, float>, ops::CastOpKernel<CUDA, float>,
......
...@@ -150,7 +150,7 @@ class CenterLossCUDAKernel : public framework::OpKernel<T> { ...@@ -150,7 +150,7 @@ class CenterLossCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
using GPUCtx = paddle::platform::CUDADeviceContext; using GPUCtx = phi::GPUContext;
REGISTER_OP_CUDA_KERNEL(center_loss, REGISTER_OP_CUDA_KERNEL(center_loss,
ops::CenterLossCUDAKernel<GPUCtx, float>, ops::CenterLossCUDAKernel<GPUCtx, float>,
ops::CenterLossCUDAKernel<GPUCtx, double>); ops::CenterLossCUDAKernel<GPUCtx, double>);
......
...@@ -17,8 +17,7 @@ limitations under the License. */ ...@@ -17,8 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
using CUDADeviceContext = paddle::platform::CUDADeviceContext;
/* see [Why use single type kernel] */ /* see [Why use single type kernel] */
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
cinn_instruction_run, cinn_instruction_run,
ops::CinnInstructionRunOpKernel<CUDADeviceContext, float>); ops::CinnInstructionRunOpKernel<phi::GPUContext, float>);
...@@ -18,6 +18,4 @@ limitations under the License. */ ...@@ -18,6 +18,4 @@ limitations under the License. */
/* see [Why use single type kernel] */ /* see [Why use single type kernel] */
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
cinn_launch, cinn_launch, paddle::operators::CinnLaunchOpKernel<phi::GPUContext, float>);
paddle::operators::CinnLaunchOpKernel<paddle::platform::CUDADeviceContext,
float>);
...@@ -21,10 +21,8 @@ namespace paddle::operators::details { ...@@ -21,10 +21,8 @@ namespace paddle::operators::details {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
template <> template <>
void* GetStream<platform::CUDADeviceContext>( void* GetStream<phi::GPUContext>(const framework::ExecutionContext& ctx) {
const framework::ExecutionContext& ctx) { const auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
const auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
return dev_ctx.stream(); return dev_ctx.stream();
} }
#endif #endif
......
...@@ -40,8 +40,7 @@ void* GetStream(const framework::ExecutionContext& ctx) { ...@@ -40,8 +40,7 @@ void* GetStream(const framework::ExecutionContext& ctx) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
template <> template <>
void* GetStream<platform::CUDADeviceContext>( void* GetStream<phi::GPUContext>(const framework::ExecutionContext& ctx);
const framework::ExecutionContext& ctx);
#endif #endif
} // namespace details } // namespace details
......
...@@ -375,7 +375,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> { ...@@ -375,7 +375,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> {
platform::NCCLCommContext::Instance().Get(rid, ctx.GetPlace()); platform::NCCLCommContext::Instance().Get(rid, ctx.GetPlace());
// use global calculate stream // use global calculate stream
const auto calcu_stream = const auto calcu_stream =
static_cast<platform::CUDADeviceContext*>( static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(ctx.GetPlace())) platform::DeviceContextPool::Instance().Get(ctx.GetPlace()))
->stream(); ->stream();
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
...@@ -607,6 +607,5 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> { ...@@ -607,6 +607,5 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
class_center_sample, class_center_sample,
ops::ClassCenterSampleCUDAKernel<paddle::platform::CUDADeviceContext, ops::ClassCenterSampleCUDAKernel<phi::GPUContext, int64_t>,
int64_t>, ops::ClassCenterSampleCUDAKernel<phi::GPUContext, int>);
ops::ClassCenterSampleCUDAKernel<paddle::platform::CUDADeviceContext, int>);
...@@ -519,11 +519,10 @@ REGISTER_OP_CPU_KERNEL(coalesce_tensor, ...@@ -519,11 +519,10 @@ REGISTER_OP_CPU_KERNEL(coalesce_tensor,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
coalesce_tensor, coalesce_tensor,
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, ops::CoalesceTensorOpKernel<phi::GPUContext, plat::float16>,
plat::float16>, ops::CoalesceTensorOpKernel<phi::GPUContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, int>, ops::CoalesceTensorOpKernel<phi::GPUContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, float>, ops::CoalesceTensorOpKernel<phi::GPUContext, double>);
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
#endif #endif
#if defined(PADDLE_WITH_ASCEND_CL) #if defined(PADDLE_WITH_ASCEND_CL)
......
...@@ -17,10 +17,9 @@ limitations under the License. */ ...@@ -17,10 +17,9 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(allreduce,
allreduce, ops::AllReduceOpKernel<phi::GPUContext, float>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, float>, ops::AllReduceOpKernel<phi::GPUContext, double>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, double>, ops::AllReduceOpKernel<phi::GPUContext, int>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, int>, ops::AllReduceOpKernel<phi::GPUContext, int64_t>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, int64_t>, ops::AllReduceOpKernel<phi::GPUContext, plat::float16>);
ops::AllReduceOpKernel<plat::CUDADeviceContext, plat::float16>);
...@@ -38,7 +38,7 @@ class AllReduceOpKernel : public framework::OpKernel<T> { ...@@ -38,7 +38,7 @@ class AllReduceOpKernel : public framework::OpKernel<T> {
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"AllReduce op can run on gpu place only for now.")); "AllReduce op can run on gpu place only for now."));
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto in = ctx.Input<framework::Tensor>("X"); auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out"); auto out = ctx.Output<framework::Tensor>("Out");
......
...@@ -47,7 +47,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel<T> { ...@@ -47,7 +47,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
......
...@@ -40,7 +40,7 @@ class BarrierOpCUDAKernel : public framework::OpKernel<T> { ...@@ -40,7 +40,7 @@ class BarrierOpCUDAKernel : public framework::OpKernel<T> {
int rid = ctx.Attr<int>("ring_id"); int rid = ctx.Attr<int>("ring_id");
auto comm = platform::NCCLCommContext::Instance().Get(rid, place); auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
auto stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); auto stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
ncclRedOp_t nccl_red_type = ncclSum; ncclRedOp_t nccl_red_type = ncclSum;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));
......
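The collective kernels repeat one lookup over and over: when use_calc_stream is set, the default stream comes from the pooled device context, which is now downcast to phi::GPUContext. A sketch of that lookup in isolation (the helper name is illustrative; place is assumed to be a CUDA place):

    // Sketch: fetch the calculation stream for a GPU place from the pool.
    gpuStream_t GetCalcStream(const platform::Place& place) {
      auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
      return static_cast<phi::GPUContext*>(dev_ctx)->stream();
    }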
...@@ -54,7 +54,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel<T> { ...@@ -54,7 +54,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
platform::errors::PreconditionNotMet("Currently, the broadcast op can " platform::errors::PreconditionNotMet("Currently, the broadcast op can "
"only be an In-Place operation.")); "only be an In-Place operation."));
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto comm = dev_ctx.nccl_comm(); auto comm = dev_ctx.nccl_comm();
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
......
...@@ -68,7 +68,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel<T> { ...@@ -68,7 +68,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
......
...@@ -419,7 +419,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> { ...@@ -419,7 +419,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
......
...@@ -54,7 +54,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel<T> { ...@@ -54,7 +54,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
......
...@@ -90,7 +90,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel<T> { ...@@ -90,7 +90,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel<T> {
T* recv_buff = temp_out.data<T>(); T* recv_buff = temp_out.data<T>();
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::ncclAllGather(send_buff, platform::dynload::ncclAllGather(send_buff,
...@@ -113,9 +113,9 @@ class CConcatOpCUDAKernel : public framework::OpKernel<T> { ...@@ -113,9 +113,9 @@ class CConcatOpCUDAKernel : public framework::OpKernel<T> {
offset += rows_per_tensor; offset += rows_per_tensor;
} }
math::ConcatFunctor<platform::CUDADeviceContext, T> functor; math::ConcatFunctor<phi::GPUContext, T> functor;
out->mutable_data<T>(out_dims, place); out->mutable_data<T>(out_dims, place);
auto& dev_ctx2 = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx2 = ctx.template device_context<phi::GPUContext>();
functor(dev_ctx2, inputs, axis, out); functor(dev_ctx2, inputs, axis, out);
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
......
...@@ -91,8 +91,7 @@ class CEmbeddingCUDAKernel : public framework::OpKernel<T> { ...@@ -91,8 +91,7 @@ class CEmbeddingCUDAKernel : public framework::OpKernel<T> {
auto *ids_t = context.Input<LoDTensor>("Ids"); auto *ids_t = context.Input<LoDTensor>("Ids");
auto *output_t = context.Output<LoDTensor>("Out"); auto *output_t = context.Output<LoDTensor>("Out");
const auto &dev_ctx = const auto &dev_ctx = context.template device_context<phi::GPUContext>();
context.template device_context<platform::CUDADeviceContext>();
const int64_t start_idx = context.Attr<int64_t>("start_index"); const int64_t start_idx = context.Attr<int64_t>("start_index");
size_t N = table_t->dims()[0]; size_t N = table_t->dims()[0];
size_t D = table_t->dims()[1]; size_t D = table_t->dims()[1];
...@@ -142,8 +141,7 @@ template <typename T> ...@@ -142,8 +141,7 @@ template <typename T>
class CEmbeddingGradCUDAKernel : public framework::OpKernel<T> { class CEmbeddingGradCUDAKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &context) const override { void Compute(const framework::ExecutionContext &context) const override {
const auto &dev_ctx = const auto &dev_ctx = context.template device_context<phi::GPUContext>();
context.template device_context<platform::CUDADeviceContext>();
const int64_t start_idx = context.Attr<int64_t>("start_index"); const int64_t start_idx = context.Attr<int64_t>("start_index");
auto ids_t = context.Input<LoDTensor>("Ids"); auto ids_t = context.Input<LoDTensor>("Ids");
auto d_output_t = context.Input<LoDTensor>(framework::GradVarName("Out")); auto d_output_t = context.Input<LoDTensor>(framework::GradVarName("Out"));
......
...@@ -312,7 +312,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel<T> { ...@@ -312,7 +312,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
......
...@@ -55,7 +55,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel<T> { ...@@ -55,7 +55,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
......
...@@ -61,7 +61,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel<T> { ...@@ -61,7 +61,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
......
...@@ -108,10 +108,10 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> { ...@@ -108,10 +108,10 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
const auto& place = ctx.GetPlace(); const auto& place = ctx.GetPlace();
const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place);
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
// use global calculate stream // use global calculate stream
const auto stream = static_cast<platform::CUDADeviceContext*>( const auto stream = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(place)) platform::DeviceContextPool::Instance().Get(place))
->stream(); ->stream();
...@@ -136,8 +136,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> { ...@@ -136,8 +136,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
// step 1, obtain logit_max // step 1, obtain logit_max
Tensor logits_max; Tensor logits_max;
logits_max = logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
void* logits_max_buff = logits_max.mutable_data<T>(place); void* logits_max_buff = logits_max.mutable_data<T>(place);
auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max); auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
...@@ -166,7 +165,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> { ...@@ -166,7 +165,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
// step 3, obtain predict target // step 3, obtain predict target
Tensor predicted_logits; Tensor predicted_logits;
predicted_logits = predicted_logits =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx); ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
predicted_logits.mutable_data<T>(place); predicted_logits.mutable_data<T>(place);
auto t = framework::EigenVector<T>::Flatten(predicted_logits); auto t = framework::EigenVector<T>::Flatten(predicted_logits);
...@@ -217,8 +216,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> { ...@@ -217,8 +216,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
// step 5, obtain sum_exp_logits // step 5, obtain sum_exp_logits
Tensor sum_exp_logits; Tensor sum_exp_logits;
sum_exp_logits = sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place); void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits); auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
...@@ -262,7 +260,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> { ...@@ -262,7 +260,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
const int rank = ctx.Attr<int>("rank"); const int rank = ctx.Attr<int>("rank");
const auto& place = ctx.GetPlace(); const auto& place = ctx.GetPlace();
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto map = distributed::ProcessGroupMapFromGid::getInstance(); auto map = distributed::ProcessGroupMapFromGid::getInstance();
distributed::ProcessGroup* pg = map->get(rid); distributed::ProcessGroup* pg = map->get(rid);
...@@ -290,8 +288,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> { ...@@ -290,8 +288,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
// step 1, obtain logit_max // step 1, obtain logit_max
Tensor logits_max; Tensor logits_max;
logits_max = logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max); auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
Eigen::DSizes<int, 1> along_axis(1); Eigen::DSizes<int, 1> along_axis(1);
...@@ -314,7 +311,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> { ...@@ -314,7 +311,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
// step 3, obtain predict target // step 3, obtain predict target
Tensor predicted_logits; Tensor predicted_logits;
predicted_logits = predicted_logits =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx); ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
predicted_logits.mutable_data<T>(place); predicted_logits.mutable_data<T>(place);
auto t = framework::EigenVector<T>::Flatten(predicted_logits); auto t = framework::EigenVector<T>::Flatten(predicted_logits);
...@@ -358,8 +355,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> { ...@@ -358,8 +355,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
// step 5, obtain sum_exp_logits // step 5, obtain sum_exp_logits
Tensor sum_exp_logits; Tensor sum_exp_logits;
sum_exp_logits = sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place); void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits); auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
...@@ -395,8 +391,7 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> { ...@@ -395,8 +391,7 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
context.Output<Tensor>(framework::GradVarName("Logits")); context.Output<Tensor>(framework::GradVarName("Logits"));
const Tensor* softmax = context.Input<Tensor>("Softmax"); const Tensor* softmax = context.Input<Tensor>("Softmax");
const int rank = context.Attr<int>("rank"); const int rank = context.Attr<int>("rank");
auto& dev_ctx = auto& dev_ctx = context.template device_context<phi::GPUContext>();
context.template device_context<platform::CUDADeviceContext>();
if (logit_grad != softmax) { if (logit_grad != softmax) {
framework::TensorCopy( framework::TensorCopy(
......
(Diffs for the remaining changed files are collapsed in this view.)