Unverified  Commit 86763023  Authored by: Leo Chen  Committed by: GitHub

unify gpu context (#44740)

* remove cudaDeviceContext

* remove more template

* fix rocm compile

* remove alias name CUDADeviceContext

* fix compile

* fix tests

* revert changes
Parent f15d930a
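The hunks below apply the rename mechanically at every call site. A minimal, hedged sketch of the recurring pattern (the header path is an assumption; the identifiers are taken from the hunks themselves):

// Hedged sketch of the call-site migration this commit performs; not from the diff itself.
#include "paddle/fluid/platform/device_context.h"  // assumed to provide the pool and phi::GPUContext

void UseUnifiedGpuContext(const paddle::platform::CUDAPlace& place) {
  // Before: static_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place))
  // After:  the same DeviceContextPool now hands back the unified phi context.
  auto* dev_ctx = static_cast<phi::GPUContext*>(
      paddle::platform::DeviceContextPool::Instance().Get(place));
  // Members such as stream() keep their signatures, so each call site only
  // changes the type name in the cast.
  auto stream = dev_ctx->stream();
  (void)stream;
}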
......@@ -94,7 +94,7 @@ class NPUEventManager {
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"CUDADeviceContext's device %d does not match"
"phi::GPUContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
......
......@@ -104,7 +104,7 @@ class EventManager {
bool DeviceId() const { return device_index_; }
gpuEvent_t GetRawCudaEvent() const { return event_; }
void Record(const paddle::platform::CUDADeviceContext& ctx) {
void Record(const phi::GPUContext& ctx) {
auto device_index = ctx.GetPlace().device;
if (!is_created_) {
CreateEvent(device_index);
......@@ -112,7 +112,7 @@ class EventManager {
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"CUDADeviceContext's device %d does not match"
"phi::GPUContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
......@@ -157,13 +157,13 @@ class EventManager {
}
}
void Block(const paddle::platform::CUDADeviceContext& ctx) const {
void Block(const phi::GPUContext& ctx) const {
if (is_created_) {
auto device_index = ctx.GetPlace().device;
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"CUDADeviceContext's device %d does not match"
"phi::GPUContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
......
......@@ -31,10 +31,10 @@ namespace distributed {
void SyncDefaultStream(
const std::vector<Place>& places,
std::vector<EventManager>& ncclEvents, // NOLINT
std::vector<std::unique_ptr<CUDADeviceContext>>& dev_ctx) { // NOLINT
std::vector<EventManager>& ncclEvents, // NOLINT
std::vector<std::unique_ptr<phi::GPUContext>>& dev_ctx) { // NOLINT
for (size_t i = 0; i < places.size(); ++i) {
auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
auto* default_ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(places[i]));
ncclEvents[i].Record(*default_ctx);
ncclEvents[i].Block(*dev_ctx[i]);
......@@ -69,7 +69,7 @@ void ProcessGroupNCCL::NCCLTask::SetOutputs(
void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() {
for (size_t i = 0; i < places_.size(); ++i) {
auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
auto* default_ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(places_[i]));
default_ctx->WaitEvent(control_events_[i].GetRawCudaEvent());
}
......@@ -201,7 +201,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache(
<< ", place: " << places_key
<< ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id);
std::vector<std::unique_ptr<CUDADeviceContext>> dev_ctx;
std::vector<std::unique_ptr<phi::GPUContext>> dev_ctx;
dev_ctx.resize(places.size());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
......@@ -209,7 +209,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache(
for (size_t i = 0; i < places.size(); ++i) {
platform::CUDADeviceGuard guard(places[i]);
nccl_comms[i] = NCCLCommManager::Create(GetSize(), GetRank(), nccl_id);
dev_ctx[i].reset(new CUDADeviceContext(places[i]));
dev_ctx[i].reset(new phi::GPUContext(places[i]));
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
......
......@@ -45,7 +45,6 @@ namespace paddle {
namespace distributed {
using Place = paddle::platform::Place;
using CUDADeviceContext = paddle::platform::CUDADeviceContext;
class ProcessGroupNCCL : public ProcessGroup {
public:
......@@ -174,8 +173,7 @@ class ProcessGroupNCCL : public ProcessGroup {
std::unordered_map<std::string, std::vector<EventManager>> places_to_events_;
std::unordered_map<std::string,
std::vector<std::unique_ptr<CUDADeviceContext>>>
std::unordered_map<std::string, std::vector<std::unique_ptr<phi::GPUContext>>>
places_to_ctx_;
std::set<int> used_place_ids_;
......
......@@ -241,7 +241,7 @@ static void SplitTensorsWithType(const DeviceContext &context,
void EagerGroup::ConcatTensors(const platform::Place &place) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
auto *default_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place));
ConcatTensorsWithType(
*default_ctx, dense_tensors_, &dense_contents_, dtype_);
......@@ -264,7 +264,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) {
void EagerGroup::SplitTensors(const platform::Place &place) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
auto *default_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place));
SplitTensorsWithType(
*default_ctx, &dense_contents_, &dense_tensors_, dtype_);
......@@ -883,7 +883,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group,
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_);
if (platform::is_gpu_place(inner_place_)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
dev_ctx = static_cast<platform::CUDADeviceContext *>(
dev_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(inner_place_));
#else
PADDLE_THROW(platform::errors::PermissionDenied(
......
......@@ -78,8 +78,7 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data,
VLOG(3) << "Loading data for GPU.";
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx =
dynamic_cast<const platform::CUDADeviceContext *>(pool.Get(place));
auto *dev_ctx = dynamic_cast<const phi::GPUContext *>(pool.Get(place));
auto gpu_place = place;
memory::Copy(gpu_place,
static_cast<void *>(input_tensor_ptr),
......
......@@ -119,8 +119,7 @@ void SerializeLodTensor(framework::Variable* var,
char* temp_ptr =
new char[tensor->numel() *
framework::DataTypeSize(tensor->dtype())]; // NOLINT
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
memory::Copy(
platform::CPUPlace(),
temp_ptr,
......@@ -168,8 +167,7 @@ void SerializeSelectedRows(framework::Variable* var,
char* temp_ptr =
new char[tensor->numel() *
framework::DataTypeSize(tensor->dtype())]; // NOLINT
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
memory::Copy(
platform::CPUPlace(),
temp_ptr,
......@@ -265,8 +263,7 @@ void DeserializeLodTensor(framework::Variable* var,
framework::DataTypeSize(tensor->dtype())]; // NOLINT
io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT
io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len); // NOLINT
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
memory::Copy(place,
tensor_data,
platform::CPUPlace(),
......@@ -311,8 +308,7 @@ void DeserializeSelectedRows(
unsigned long data_len; // NOLINT
io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT
io_buffer_itr.copy_and_forward(temp_ptr, data_len);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
memory::Copy(place,
tensor_data,
platform::CPUPlace(),
......
......@@ -43,8 +43,7 @@ int GetMicroId(const platform::DeviceContext& ctx,
std::vector<char> temp;
temp.resize(tensor->numel() * framework::DataTypeSize(tensor->dtype()));
char* temp_ptr = temp.data();
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
memory::Copy(platform::CPUPlace(),
temp_ptr,
tensor->place(),
......
......@@ -134,21 +134,20 @@ void ScaleAPI(const paddle::experimental::Tensor& x,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} else if (expected_kernel_place == paddle::platform::CUDAPlace()) {
auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
pool.Get(expected_kernel_place));
auto* dev_ctx =
dynamic_cast<phi::GPUContext*>(pool.Get(expected_kernel_place));
if (!dev_ctx) {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Cannot convert device_context to CUDADeviceContext."
"This indicates backend mismatch."
"Pleas double check your expected place"));
}
ScaleDeviceDispatch<paddle::platform::CUDADeviceContext>(
*dense_tensor.get(),
*dev_ctx,
scale,
bias,
bias_after_scale,
dense_out.get());
ScaleDeviceDispatch<phi::GPUContext>(*dense_tensor.get(),
*dev_ctx,
scale,
bias,
bias_after_scale,
dense_out.get());
#endif
} else {
PADDLE_THROW(paddle::platform::errors::Fatal(
......
......@@ -38,8 +38,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) {
auto& place = dense_tensor->place();
if (paddle::platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::framework::details::tensor_check<
paddle::platform::CUDADeviceContext>(
paddle::framework::details::tensor_check<phi::GPUContext>(
api_name, tensor_name, *dense_tensor, place);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
......
......@@ -66,8 +66,7 @@ TEST(Benchmark, FluidScaleCUDA) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx =
dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
auto stream = dev_ctx->stream();
paddle::memory::Copy(place,
mutable_x,
......@@ -121,8 +120,7 @@ TEST(Benchmark, FluidMatmulCUDA) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx =
dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
auto stream = dev_ctx->stream();
auto* x_tensor = X->MutableVar()->GetMutable<framework::LoDTensor>();
......@@ -181,8 +179,7 @@ TEST(Benchmark, FluidMLPCUDA) {
for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx =
dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
auto stream = dev_ctx->stream();
std::vector<float> x_src_data(MLP_M * MLP_N, MLP_X_VAL);
......
......@@ -171,8 +171,7 @@ static void FluidCheckTensorValue(const std::shared_ptr<imperative::VarBase>& X,
if (place == paddle::platform::CUDAPlace()) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx =
dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
auto stream = dev_ctx->stream();
paddle::memory::Copy(paddle::platform::CPUPlace(),
......@@ -204,8 +203,7 @@ static void FluidCheckGradTensorValue(
if (place == paddle::platform::CUDAPlace()) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx =
dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
auto stream = dev_ctx->stream();
paddle::memory::Copy(paddle::platform::CPUPlace(),
......
......@@ -40,8 +40,8 @@ bool CompareGradTensorWithValue(const paddle::experimental::Tensor& target,
#ifdef PADDLE_WITH_CUDA
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
pool.Get(paddle::platform::CUDAPlace()));
auto* dev_ctx =
dynamic_cast<phi::GPUContext*>(pool.Get(paddle::platform::CUDAPlace()));
auto stream = dev_ctx->stream();
paddle::memory::Copy(paddle::platform::CPUPlace(),
......@@ -79,8 +79,8 @@ bool CompareTensorWithValue(const paddle::experimental::Tensor& target,
#ifdef PADDLE_WITH_CUDA
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
pool.Get(paddle::platform::CUDAPlace()));
auto* dev_ctx =
dynamic_cast<phi::GPUContext*>(pool.Get(paddle::platform::CUDAPlace()));
auto stream = dev_ctx->stream();
paddle::memory::Copy(paddle::platform::CPUPlace(),
......
......@@ -92,9 +92,8 @@ REGISTER_OP_WITHOUT_GRADIENT(
paddle::framework::OpKernelTestProtoAndCheckerMaker);
REGISTER_OP_CPU_KERNEL(test_op,
paddle::framework::TestKernel<phi::CPUContext, float>);
REGISTER_OP_CUDA_KERNEL(
test_op,
paddle::framework::TestKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(test_op,
paddle::framework::TestKernel<phi::GPUContext, float>);
static void BuildVar(const std::string& param_name,
std::initializer_list<const char*> arguments,
......
......@@ -2809,7 +2809,7 @@ void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) {
MiniBatchGpuPack::MiniBatchGpuPack(const paddle::platform::Place& place,
const std::vector<UsedSlotInfo>& infos) {
place_ = place;
stream_ = dynamic_cast<platform::CUDADeviceContext*>(
stream_ = dynamic_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
......@@ -2843,7 +2843,7 @@ MiniBatchGpuPack::~MiniBatchGpuPack() {}
void MiniBatchGpuPack::reset(const paddle::platform::Place& place) {
place_ = place;
stream_ = dynamic_cast<platform::CUDADeviceContext*>(
stream_ = dynamic_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
ins_num_ = 0;
......
......@@ -89,7 +89,7 @@ void SlotRecordInMemoryDataFeed::FillSlotValueOffset(
const int float_slot_size,
const UsedSlotGpuType *used_slots) {
auto stream =
dynamic_cast<platform::CUDADeviceContext *>(
dynamic_cast<phi::GPUContext *>(
paddle::platform::DeviceContextPool::Instance().Get(this->place_))
->stream();
FillSlotValueOffsetKernel<<<GET_BLOCKS(used_slot_num),
......@@ -168,7 +168,7 @@ void SlotRecordInMemoryDataFeed::CopyForTensor(
const int float_slot_size,
const UsedSlotGpuType *used_slots) {
auto stream =
dynamic_cast<platform::CUDADeviceContext *>(
dynamic_cast<phi::GPUContext *>(
paddle::platform::DeviceContextPool::Instance().Get(this->place_))
->stream();
......
......@@ -103,8 +103,8 @@ struct CastDataType {
CastDataTypeFunctor<InType, OutType>());
#if defined(__NVCC__) || defined(__HIPCC__)
} else if (platform::is_gpu_place(in_.place())) {
platform::Transform<platform::CUDADeviceContext> trans;
auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
platform::Transform<phi::GPUContext> trans;
auto* context = static_cast<const phi::GPUContext*>(ctx_);
trans(*context,
in_begin,
in_end,
......
......@@ -19,7 +19,7 @@ limitations under the License. */
TEST(DataTypeTransform, GPUTransform) {
auto cpu_place = paddle::platform::CPUPlace();
auto gpu_place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(gpu_place);
phi::GPUContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
......
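Several test hunks in this diff (the DataTypeTransform test above, the tensor_util and TensorRT tests below) construct a context directly instead of fetching one from the pool. A minimal sketch of that construction, using only calls visible in these hunks; header paths are assumptions and any further setup hidden behind the collapsed context is omitted:

// Sketch only: standalone phi::GPUContext construction as used by the tests.
#include "paddle/fluid/memory/allocation/allocator_facade.h"  // assumed path
#include "paddle/fluid/platform/device_context.h"             // assumed path

void BuildTestGpuContext() {
  paddle::platform::CUDAPlace gpu_place(0);
  phi::GPUContext gpu_ctx(gpu_place);
  // The tests route the facade's allocator for this place/stream into the context.
  gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(gpu_place, gpu_ctx.stream())
                           .get());
}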
......@@ -105,7 +105,7 @@ struct TestBroadcastOpHandle {
for (int i = 0; i < count; ++i) {
auto p = p::CUDAPlace(i);
place_list_.push_back(p);
ctxs_.emplace_back(new p::CUDADeviceContext(p));
ctxs_.emplace_back(new phi::GPUContext(p));
}
nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
#else
......
......@@ -46,7 +46,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
gc_(gc) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place)) {
dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
dev_ctx_ = reinterpret_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place));
if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
platform::CUDADeviceGuard guard(place.device);
......
......@@ -81,7 +81,7 @@ class EagerDeletionOpHandle : public OpHandleBase {
GarbageCollector *gc_; // not own
std::vector<Variable *> vars_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDADeviceContext *dev_ctx_{nullptr};
phi::GPUContext *dev_ctx_{nullptr};
gpuEvent_t event_{nullptr};
#endif
};
......
......@@ -58,7 +58,7 @@ struct TestGatherOpHandle {
for (int i = 0; i < count; ++i) {
auto p = p::CUDAPlace(i);
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CUDADeviceContext(p));
ctxs_.emplace_back(new phi::GPUContext(p));
}
#else
PADDLE_THROW(
......
......@@ -367,8 +367,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
if (platform::is_gpu_place(tensor->place())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
tensor_check<platform::CUDADeviceContext>(
op_type, var_name, *tensor, place);
tensor_check<phi::GPUContext>(op_type, var_name, *tensor, place);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Tensor[%s] use gpu place. PaddlePaddle must compile with GPU.",
......
......@@ -135,7 +135,7 @@ __global__ void CheckNanInfKernel(const T* value,
template <>
template <typename T>
void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
void TensorCheckerVisitor<phi::GPUContext>::apply(
typename std::enable_if<
std::is_floating_point<T>::value ||
std::is_same<T, ::paddle::platform::complex<float>>::value ||
......@@ -143,7 +143,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
const {
int print_num = 3;
auto* dev_ctx = reinterpret_cast<platform::CUDADeviceContext*>(
auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(tensor_.place()));
int dev_id = tensor_.place().device;
PADDLE_ENFORCE_EQ(
......@@ -226,13 +226,13 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
}
template <>
void tensor_check<platform::CUDADeviceContext>(const std::string& op_type,
const std::string& var_name,
const framework::Tensor& tensor,
const platform::Place& place) {
void tensor_check<phi::GPUContext>(const std::string& op_type,
const std::string& var_name,
const framework::Tensor& tensor,
const platform::Place& place) {
std::call_once(init_multi_gpu_op_var_map_flag, InitMultiGPUOpVarMap);
TensorCheckerVisitor<platform::CUDADeviceContext> vistor(
TensorCheckerVisitor<phi::GPUContext> vistor(
op_type, var_name, tensor, place);
VisitDataType(framework::TransToProtoVarType(tensor.dtype()), vistor);
}
......
......@@ -184,8 +184,7 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
dev_ctx.second->Wait();
}
} else {
auto stream =
static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
auto stream = static_cast<phi::GPUContext *>(waited_ctx)->stream();
for (auto &ev : events_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0));
......@@ -224,8 +223,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto stream =
static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
->stream();
static_cast<phi::GPUContext *>(dev_ctxes_.at(place))->stream();
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
......@@ -254,8 +252,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto stream =
static_cast<platform::CUDADeviceContext *>(pool.Get(place))
->stream();
static_cast<phi::GPUContext *>(pool.Get(place))->stream();
platform::GpuStreamSync(stream);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
......@@ -277,7 +274,7 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
if (in_var_handle) {
if (platform::is_gpu_place(in_var_handle->place())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto stream = static_cast<platform::CUDADeviceContext *>(
auto stream = static_cast<phi::GPUContext *>(
dev_ctxes_.at(in_var_handle->place()))
->stream();
#ifdef PADDLE_WITH_HIP
......@@ -318,8 +315,8 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
if (!events_.empty()) { // Use event
for (auto &p : dev_ctxes_) {
auto dev_id = p.first.device;
auto *cuda_dev_ctx = static_cast<platform::CUDADeviceContext *>(p.second);
VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id;
auto *cuda_dev_ctx = static_cast<phi::GPUContext *>(p.second);
VLOG(10) << "phi::GPUContext:" << cuda_dev_ctx << ", dev_id:" << dev_id;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream()));
......@@ -339,7 +336,7 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p,
callback();
} else {
auto *ctx = dev_ctxes_.at(p);
auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(ctx);
auto *cuda_ctx = static_cast<phi::GPUContext *>(ctx);
cuda_ctx->RecordEvent(events_.at(p.device), callback);
}
#else
......
......@@ -69,7 +69,7 @@ struct TestReduceOpHandle {
for (int i = 0; i < count; ++i) {
auto p = p::CUDAPlace(i);
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CUDADeviceContext(p));
ctxs_.emplace_back(new p::phi::GPUContext(p));
}
nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
#else
......
......@@ -77,7 +77,7 @@ struct ScaleLossGradFunctor {
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
OutT cast_coeff = static_cast<OutT>(coeff_);
auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
auto stream = static_cast<phi::GPUContext *>(ctx_)->stream();
memory::Copy(place_,
out_data,
platform::CPUPlace(),
......
......@@ -151,7 +151,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
const int hidden_size,
const int expand_embed_dim,
const int64_t total_length) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
auto stream = dynamic_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
......@@ -235,7 +235,7 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place,
const int64_t* gpu_len,
int slot_num,
int total_len) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
auto stream = dynamic_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
#ifdef PADDLE_WITH_HIP
......@@ -265,7 +265,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
const int expand_embed_dim,
const int64_t total_length,
const int batch_size) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
auto stream = dynamic_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto slot_lengths_lod = slot_lengths;
......
......@@ -223,10 +223,10 @@ class AfsManager {
delete read_stream;
}
int PopenBidirectionalInternal(const char* command,
FILE*& fp_read, // NOLINT
FILE*& fp_write,
pid_t& pid, // NOLINT
bool read, // NOLINT
FILE*& fp_read, // NOLINT
FILE*& fp_write, // NOLINT
pid_t& pid, // NOLINT
bool read, // NOLINT
bool write) {
std::lock_guard<std::mutex> g(g_flock);
int fd_read[2];
......@@ -440,10 +440,9 @@ class BoxWrapper {
std::vector<gpuStream_t*> stream_list;
for (int i = 0; i < platform::GetGPUDeviceCount(); ++i) {
VLOG(3) << "before get context i[" << i << "]";
platform::CUDADeviceContext* context =
dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(
platform::CUDAPlace(i)));
phi::GPUContext* context = dynamic_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(
platform::CUDAPlace(i)));
stream_list_[i] = context->stream();
stream_list.push_back(&stream_list_[i]);
}
......
......@@ -300,7 +300,7 @@ void AccessorWrapper<GPUAccessor>::CopyForPullImpl(
const int64_t total_length,
int* gpu_dim,
int feature_value_size) {
auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
auto stream = dynamic_cast<phi::GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream();
auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
......@@ -333,7 +333,7 @@ void AccessorWrapper<GPUAccessor>::CopyForPushImpl(
size_t grad_value_size,
std::vector<int>& slot_vector,
std::vector<int>& slot_mf_dim_vector) {
auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
auto stream = dynamic_cast<phi::GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream();
auto slot_lengths_lod = slot_lengths;
......
......@@ -90,7 +90,7 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
const int64_t* gpu_len,
int slot_num,
int total_len) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
auto stream = dynamic_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
CopyKeysKernel<<<(total_len + 1024 - 1) / 1024, 1024, 0, stream>>>(
......
......@@ -78,14 +78,12 @@ DefaultStreamGarbageCollector::DefaultStreamGarbageCollector(
: GarbageCollector(place, max_memory_size) {}
void DefaultStreamGarbageCollector::Wait() const {
static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
->WaitStreamCallback();
static_cast<phi::GPUContext *>(this->dev_ctx_)->WaitStreamCallback();
}
void DefaultStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
->AddStreamCallback(callback);
static_cast<phi::GPUContext *>(this->dev_ctx_)->AddStreamCallback(callback);
}
StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
......
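The garbage-collector hunk above relies on the context's stream-callback API, which the rename leaves unchanged. A minimal hedged sketch of that pattern; the function names here are illustrative, not from the diff, and the context is assumed to come from the DeviceContextPool as elsewhere:

// Sketch of the stream-callback pattern used by DefaultStreamGarbageCollector.
#include <functional>
#include "paddle/fluid/platform/device_context.h"  // assumed path

void DeferReleaseOnStream(phi::GPUContext* dev_ctx, std::function<void()> release) {
  // Runs `release` once the work currently queued on the context's stream completes.
  dev_ctx->AddStreamCallback(release);
}

void DrainStreamCallbacks(phi::GPUContext* dev_ctx) {
  // Blocks until every callback added so far has finished.
  dev_ctx->WaitStreamCallback();
}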
......@@ -48,8 +48,7 @@ void SetMicroId(paddle::framework::Scope* scope,
char* temp_ptr = temp.data();
float* temp_ptr_float = reinterpret_cast<float*>(temp_ptr);
temp_ptr_float[0] = micro_id;
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(*dev_ctx).stream();
auto stream = reinterpret_cast<const phi::GPUContext&>(*dev_ctx).stream();
memory::Copy(
place,
tensor_data,
......
......@@ -514,7 +514,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
}
}
#ifdef PADDLE_WITH_CUDA
auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
auto* dev_ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(place));
PADDLE_ENFORCE_GPU_SUCCESS(
cudaEventRecord(context->event_, dev_ctx->stream()));
......
......@@ -229,7 +229,7 @@ void TestMainImpl(std::string func_name,
device_code.SetWorkloadPerThread(1);
device_code.Launch(n, &args);
auto* dev_ctx = reinterpret_cast<paddle::platform::CUDADeviceContext*>(
auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
dev_ctx->Wait();
......
......@@ -38,7 +38,7 @@ void CopyToCPUHelper(std::vector<T> *cpu_,
size_t *gpu_memory_size_) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// COPY GPU Data To CPU
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
auto *dev_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get((*gpu_)->place()));
auto stream = dev_ctx->stream();
void *src = (*gpu_)->ptr();
......@@ -63,7 +63,7 @@ void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
*gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T)
(*gpu_) = memory::Alloc(place, *gpu_memory_size_);
void *dst = (*gpu_)->ptr();
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
auto *dev_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(),
......
......@@ -38,7 +38,7 @@ static __global__ void multiply_10(int* ptr) {
}
gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
return reinterpret_cast<const paddle::platform::CUDADeviceContext*>(
return reinterpret_cast<const phi::GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream();
}
......
......@@ -854,9 +854,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
platform::RecordEvent record(
"RecordStreamForGC", platform::TracerEventType::UserDefined, 10);
gpuStream_t stream = reinterpret_cast<const platform::CUDADeviceContext&>(
instr.DeviceContext())
.stream();
gpuStream_t stream =
reinterpret_cast<const phi::GPUContext&>(instr.DeviceContext()).stream();
auto TensorRecordStream = [&stream](Tensor& tensor) {
auto allocation = tensor.Holder();
if (allocation == nullptr) {
......
......@@ -236,9 +236,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,
paddle::framework::OpKernelTest<phi::CPUContext, float>);
REGISTER_OP_CUDA_KERNEL(
op_with_kernel,
paddle::framework::OpKernelTest<paddle::platform::CUDADeviceContext,
float>);
op_with_kernel, paddle::framework::OpKernelTest<phi::GPUContext, float>);
TEST(OperatorRegistrar, CPU) {
paddle::framework::proto::OpDesc op_desc;
......@@ -263,9 +261,9 @@ TEST(OperatorRegistrar, CUDA) {
}
static int op_test_value = 0;
using paddle::platform::CUDADeviceContext;
using paddle::platform::DeviceContext;
using phi::CPUContext;
using phi::GPUContext;
namespace paddle {
namespace framework {
......@@ -301,7 +299,7 @@ class OpMultiKernelTest<CPUContext, T> : public paddle::framework::OpKernel<T> {
};
template <typename T>
class OpMultiKernelTest<CUDADeviceContext, T>
class OpMultiKernelTest<phi::GPUContext, T>
: public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const {
......@@ -325,7 +323,7 @@ class OpMultiKernelTest2<CPUContext, T>
};
template <typename T>
class OpMultiKernelTest2<CUDADeviceContext, T>
class OpMultiKernelTest2<phi::GPUContext, T>
: public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const {
......@@ -351,12 +349,12 @@ REGISTER_OP_KERNEL(
op_with_multi_kernel,
CUDA,
paddle::platform::CUDAPlace,
paddle::framework::OpMultiKernelTest<CUDADeviceContext, float>);
paddle::framework::OpMultiKernelTest<phi::GPUContext, float>);
REGISTER_OP_KERNEL(
op_with_multi_kernel,
CUDNN,
paddle::platform::CUDAPlace,
paddle::framework::OpMultiKernelTest2<CUDADeviceContext, float>);
paddle::framework::OpMultiKernelTest2<phi::GPUContext, float>);
TEST(OperatorRegistrar, OpWithMultiKernel) {
paddle::framework::proto::OpDesc op_desc;
......
......@@ -416,13 +416,12 @@ class ExecutionContext {
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
const inline platform::CUDADeviceContext& cuda_device_context() const {
const inline phi::GPUContext& cuda_device_context() const {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()),
true,
platform::errors::PreconditionNotMet(
"Current device context place is not GPUPlace."));
return *reinterpret_cast<const platform::CUDADeviceContext*>(
&device_context_);
return *reinterpret_cast<const phi::GPUContext*>(&device_context_);
}
#endif
......
......@@ -863,12 +863,12 @@ void ParallelExecutor::BCastParamsToDevices(
nccl_ctxs->WaitAll();
} else {
auto src_place = member_->places_[0];
auto src_dev_ctx = static_cast<platform::CUDADeviceContext *>(
auto src_dev_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(src_place));
auto sizeof_dtype = framework::SizeOfType(dtype) * numel;
for (size_t i = 1; i < member_->places_.size(); ++i) {
auto dst_place = member_->places_[i];
auto dst_dev_ctx = static_cast<platform::CUDADeviceContext *>(
auto dst_dev_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(dst_place));
src_dev_ctx->Wait();
dst_dev_ctx->Wait();
......@@ -1492,8 +1492,8 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) {
global_scope, member_->places_);
auto &pool = platform::DeviceContextPool::Instance();
for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) {
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
pool.Get(member_->places_[dev_id]));
auto *dev_ctx =
static_cast<phi::GPUContext *>(pool.Get(member_->places_[dev_id]));
auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]);
dev_ctx->set_nccl_comm(nccl_ctx.comm());
}
......
......@@ -72,7 +72,7 @@ struct ConvertToPhiContext<phi::CPUContext> {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct ConvertToPhiContext<platform::CUDADeviceContext> {
struct ConvertToPhiContext<phi::GPUContext> {
using TYPE = phi::GPUContext;
};
#endif
......
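With the alias gone, the specialization above becomes an identity mapping. A small hedged check of that effect; the namespace qualifying ConvertToPhiContext is not visible in this hunk, so paddle::framework is an assumption:

// Sketch only: after this commit the GPU specialization maps the context to itself.
#include <type_traits>

static_assert(
    std::is_same<paddle::framework::ConvertToPhiContext<phi::GPUContext>::TYPE,
                 phi::GPUContext>::value,
    "the unified GPU context converts to itself");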
......@@ -261,8 +261,7 @@ void TensorCopyImpl(const TENSOR& src,
"place is %s, context place is %s.",
src_gpu_place,
ctx_gpu_place));
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
......@@ -284,8 +283,7 @@ void TensorCopyImpl(const TENSOR& src,
"destination place is %s, context place is %s.",
dst_gpu_place,
ctx_gpu_place));
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
}
else if (platform::is_gpu_place(src_place) && // NOLINT
......@@ -308,8 +306,7 @@ void TensorCopyImpl(const TENSOR& src,
"device context GPU number is %d.",
src_gpu_place.device,
ctx_gpu_place.device));
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
memory::Copy(
dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
}
......@@ -333,8 +330,7 @@ void TensorCopyImpl(const TENSOR& src,
"device context GPU number is %d.",
dst_gpu_place.device,
ctx_gpu_place.device));
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
memory::Copy(
dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
}
......@@ -349,8 +345,7 @@ void TensorCopyImpl(const TENSOR& src,
platform::errors::PreconditionNotMet(
"Context place error, excepted GPUPlace, but actually %s.",
ctx_place));
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
if (platform::is_same_place(src_place, dst_place)) {
memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
......@@ -1076,8 +1071,7 @@ void TensorToStream(std::ostream& os,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(dev_ctx);
auto& gpu_dev_ctx = static_cast<const phi::GPUContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
......@@ -1482,13 +1476,12 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
platform::CUDAPlace(dl_tensor.device.device_id);
dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place);
memory::Copy(
dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const platform::CUDADeviceContext&>(*ctx).stream());
memory::Copy(dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(*ctx).stream());
}
#endif
#ifdef PADDLE_WITH_XPU
......
......@@ -164,13 +164,12 @@ void TensorFromArray(const T* src,
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy(
dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
memory::Copy(dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -242,13 +241,12 @@ void TensorFromVector(const std::vector<T>& src,
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy(
dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
memory::Copy(dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -340,13 +338,12 @@ inline void TensorFromVector(const std::vector<bool>& src,
}
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy(
dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
memory::Copy(dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -444,13 +441,12 @@ void TensorToVector(const Tensor& src,
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy(
dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
memory::Copy(dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#if defined(PADDLE_WITH_XPU)
......@@ -503,13 +499,12 @@ inline void TensorToVector(const Tensor& src,
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy(
dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
memory::Copy(dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#if defined(PADDLE_WITH_XPU)
......
......@@ -73,7 +73,7 @@ TEST(TensorCopy, Tensor) {
// CPU Tensor to GPU Tensor
auto gpu_place = new platform::CUDAPlace(0);
platform::CUDADeviceContext gpu_ctx(*gpu_place);
phi::GPUContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
......@@ -170,7 +170,7 @@ TEST(TensorFromVector, Tensor) {
// Copy to GPUTensor
gpu_tensor.Resize(phi::make_ddim({3, 3}));
auto gpu_place = new paddle::platform::CUDAPlace();
paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place);
phi::GPUContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
......@@ -238,7 +238,7 @@ TEST(TensorToVector, Tensor) {
std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place);
phi::GPUContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream())
.get());
......@@ -255,22 +255,20 @@ TEST(TensorToVector, Tensor) {
#endif
}
TEST(TensorToVector, Tensor_bool) {
{
paddle::framework::Tensor src;
bool* src_ptr = src.mutable_data<bool>({3, 3}, paddle::platform::CPUPlace());
for (int i = 0; i < 3 * 3; ++i) {
src_ptr[i] = static_cast<bool>(i % 2);
}
TEST(TensorToVector, Tensor_bool){{paddle::framework::Tensor src;
bool* src_ptr = src.mutable_data<bool>({3, 3}, paddle::platform::CPUPlace());
for (int i = 0; i < 3 * 3; ++i) {
src_ptr[i] = static_cast<bool>(i % 2);
}
paddle::platform::CPUPlace place;
std::vector<bool> dst;
paddle::framework::TensorToVector<bool>(src, &dst);
paddle::platform::CPUPlace place;
std::vector<bool> dst;
paddle::framework::TensorToVector<bool>(src, &dst);
for (int i = 0; i < 3 * 3; ++i) {
EXPECT_EQ(src_ptr[i], dst[i]);
}
for (int i = 0; i < 3 * 3; ++i) {
EXPECT_EQ(src_ptr[i], dst[i]);
}
} // namespace framework
#ifdef PADDLE_WITH_CUDA
{
......@@ -287,7 +285,7 @@ TEST(TensorToVector, Tensor_bool) {
};
paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place);
phi::GPUContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream())
.get());
......@@ -328,7 +326,7 @@ TEST(TensorToVector, Tensor_bool) {
}
}
#endif
}
} // namespace paddle
TEST(TensorFromDLPack, Tensor) {
{
......@@ -525,7 +523,7 @@ TEST(Tensor, FromAndToStream) {
Tensor dst_tensor;
auto gpu_place = new platform::CUDAPlace();
platform::CUDADeviceContext gpu_ctx(*gpu_place);
phi::GPUContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
......
......@@ -95,7 +95,7 @@ static void AllReduce(const phi::SelectedRows &src,
auto dtype = framework::TransToProtoVarType(src_tensor.dtype());
auto nccl_dtype = platform::ToNCCLDataType(dtype);
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
auto *dev_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place));
bool use_calc_stream = (dev_ctx->stream() == stream);
......@@ -220,7 +220,7 @@ void AllReduce(const framework::Variable &src,
int ring_id,
bool use_calc_stream) {
const auto &place = GetVarPlace(src);
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
auto *dev_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place));
platform::NCCLComm *comm =
platform::NCCLCommContext::Instance().Get(ring_id, place);
......
......@@ -122,10 +122,9 @@ class TensorAddFunctor
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void operator()(const platform::CUDAPlace& place) const {
platform::CUDADeviceContext* ctx =
dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(*ctx);
phi::GPUContext* ctx = dynamic_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
#else
......@@ -433,7 +432,7 @@ void TensorAdd(const VarType& src, VarType* dst) {
if (data_type == framework::proto::VarType::FP16) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return TensorAddImpl<platform::CUDADeviceContext, platform::float16>(
return TensorAddImpl<phi::GPUContext, platform::float16>(
src_tensor, dst_tensor, place);
#else
PADDLE_THROW(platform::errors::Unimplemented(
......@@ -450,7 +449,7 @@ void TensorAdd(const VarType& src, VarType* dst) {
if (data_type == framework::proto::VarType::BF16) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return TensorAddImpl<platform::CUDADeviceContext, platform::bfloat16>(
return TensorAddImpl<phi::GPUContext, platform::bfloat16>(
src_tensor, dst_tensor, place);
#else
PADDLE_THROW(platform::errors::Unimplemented(
......@@ -499,8 +498,8 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (paddle::platform::is_gpu_place(place)) {
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, float);
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, double);
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, float);
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, double);
} else {
#endif
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float);
......@@ -551,8 +550,8 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place)) {
PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, float);
PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, double);
PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, float);
PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, double);
} else {
#endif
PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float);
......@@ -614,8 +613,8 @@ std::shared_ptr<ReturnVarType> SelectedRowsMerge(const VarType& src1,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (paddle::platform::is_gpu_place(place)) {
PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, float);
PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, double);
PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, float);
PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double);
} else {
#endif
PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, float);
......
......@@ -85,7 +85,7 @@ void NCCLParallelContext::Init() {
VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id
<< " ring id: " << ring_id;
// it will assign nccl_comm in CUDADeviceContext within ring_id
// it will assign nccl_comm in phi::GPUContext within ring_id
platform::NCCLCommContext::Instance().CreateComm(&nccl_ids[ring_id],
strategy_.nranks_,
strategy_.local_rank_,
......@@ -119,7 +119,7 @@ void NCCLParallelContext::InitWithRingID(int ring_id) {
VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id
<< " ring id: " << ring_id;
// it will assign nccl_comm in CUDADeviceContext within ring_id
// it will assign nccl_comm in phi::GPUContext within ring_id
platform::NCCLCommContext::Instance().CreateComm(
&nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id);
......@@ -177,7 +177,7 @@ void NCCLParallelContext::WaitCompute(int ring_id) {
ring_id,
compute_events_.size()));
auto compute_stream = static_cast<platform::CUDADeviceContext *>(
auto compute_stream = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place_))
->stream();
auto comm_stream =
......@@ -207,7 +207,7 @@ void NCCLParallelContext::WaitComm(int ring_id) {
ring_id,
comm_events_.size()));
auto compute_stream = static_cast<platform::CUDADeviceContext *>(
auto compute_stream = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place_))
->stream();
auto comm_stream =
......@@ -225,7 +225,7 @@ void NCCLParallelContext::WaitComm(int ring_id) {
}
void NCCLParallelContext::SynchronizeCompute() {
auto *compute_dev_ctx = static_cast<platform::CUDADeviceContext *>(
auto *compute_dev_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place_));
compute_dev_ctx->Wait();
}
......
......@@ -283,11 +283,10 @@ void Group::ConcatTensors(const platform::DeviceContext &context) {
auto place = context.GetPlace();
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
ConcatTensorsWithType(
static_cast<const platform::CUDADeviceContext &>(context),
dense_tensors_,
&dense_contents_,
dtype_);
ConcatTensorsWithType(static_cast<const phi::GPUContext &>(context),
dense_tensors_,
&dense_contents_,
dtype_);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't concat grad tensors since it's not compiled with NCCL,"
......@@ -344,11 +343,10 @@ void Group::SplitTensors(const platform::DeviceContext &context) {
auto place = context.GetPlace();
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
SplitTensorsWithType(
static_cast<const platform::CUDADeviceContext &>(context),
&dense_contents_,
&dense_tensors_,
dtype_);
SplitTensorsWithType(static_cast<const phi::GPUContext &>(context),
&dense_contents_,
&dense_tensors_,
dtype_);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't split grad tensor since it's not compiled with NCCL,"
......
......@@ -27,13 +27,10 @@ void Group::DivNRanks(framework::Tensor *tensor,
"Unsupport BF16 in DataParallel for now"));
}
framework::VisitDataTypeForHIP(
dtype_,
DivNRanksForAllReduce<platform::CUDADeviceContext>(
tensor, nranks, context));
dtype_, DivNRanksForAllReduce<phi::GPUContext>(tensor, nranks, context));
#else
framework::VisitDataType(dtype_,
DivNRanksForAllReduce<platform::CUDADeviceContext>(
tensor, nranks, context));
framework::VisitDataType(
dtype_, DivNRanksForAllReduce<phi::GPUContext>(tensor, nranks, context));
#endif
}
#endif
......
......@@ -39,7 +39,7 @@ imperative::ParallelStrategy GetStrategy(int local_rank) {
void AllReduceByStream(int local_rank, int device_id) {
int data_size = 32;
const auto& place = platform::CUDAPlace(device_id);
platform::CUDADeviceContext ctx(place);
phi::GPUContext ctx(place);
// heter_parallel_ctx
imperative::HeterParallelContext hpc(GetStrategy(local_rank), device_id);
......
......@@ -78,7 +78,7 @@ void Broadcast(int local_rank, int device_id) {
int data_size = 4;
float test_data = 7;
const auto& place = platform::CUDAPlace(device_id);
platform::CUDADeviceContext ctx(place);
phi::GPUContext ctx(place);
imperative::NCCLParallelContext npc(GetStrategy(local_rank), place);
......
......@@ -194,8 +194,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt,
"Only one choice can be made between CPU and XPU."));
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
auto *dev_ctx = static_cast<const phi::GPUContext *>(pool.Get(place));
auto dst_gpu_place = place;
memory::Copy(dst_gpu_place,
static_cast<void *>(input_ptr),
......@@ -283,7 +282,7 @@ bool AnalysisPredictor::Init(
// NOTE: If the external_stream equals to global_device_contexts's stream,
// then fallback.
auto global_stream =
static_cast<platform::CUDADeviceContext *>(
static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place_))
->stream();
if (predictor_stream_ != global_stream) {
......@@ -1658,8 +1657,7 @@ void AnalysisPredictor::CollectShapeRangeInfo() {
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto gpu_place = place_;
auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
pool.Get(gpu_place));
auto *dev_ctx = static_cast<const phi::GPUContext *>(pool.Get(gpu_place));
#ifdef PADDLE_WITH_HIP
hipStreamSynchronize(dev_ctx->stream());
#else
......@@ -2331,8 +2329,7 @@ void InternalUtils::SyncStream(paddle_infer::Predictor *p) {
auto *pred = dynamic_cast<paddle::AnalysisPredictor *>(p->predictor_.get());
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto *dev_ctx = reinterpret_cast<paddle::platform::CUDADeviceContext *>(
pool.Get(pred->place_));
auto *dev_ctx = reinterpret_cast<phi::GPUContext *>(pool.Get(pred->place_));
cudaStreamSynchronize(dev_ctx->stream());
#endif
}
......
......@@ -248,8 +248,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
auto *dev_ctx = static_cast<const phi::GPUContext *>(pool.Get(place_));
auto dst_gpu_place = place_;
memory::Copy(dst_gpu_place,
static_cast<void *>(input_ptr),
......
......@@ -158,8 +158,7 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst,
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
paddle::platform::CUDAPlace gpu_place(dst.device_);
auto* dev_ctx = static_cast<const paddle::platform::CUDADeviceContext*>(
pool.Get(gpu_place));
auto* dev_ctx = static_cast<const phi::GPUContext*>(pool.Get(gpu_place));
if (src.place() == PlaceType::kCPU) {
paddle::memory::Copy(gpu_place,
......
......@@ -139,13 +139,12 @@ void MemoryCopyAsync(const platform::Place& dst_place,
} else if (platform::is_gpu_place(dst_place) &&
platform::is_gpu_place(src_place)) {
auto gpu_place = src_place;
memory::Copy(
gpu_place,
dst_data,
gpu_place,
src_data,
size,
static_cast<const platform::CUDADeviceContext&>(ctx).stream());
memory::Copy(gpu_place,
dst_data,
gpu_place,
src_data,
size,
static_cast<const phi::GPUContext&>(ctx).stream());
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
......
......@@ -74,7 +74,7 @@ void make_fake_model(std::string* model, std::string* param) {
framework::Scope scope;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
phi::GPUContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
......
......@@ -118,8 +118,7 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
platform::GpuStreamSync(
static_cast<const platform::CUDADeviceContext&>(ctx).stream());
platform::GpuStreamSync(static_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
std::vector<float> result;
......
......@@ -68,7 +68,7 @@ TEST(EngineIOConverterTester, DefaultCPU) {
TEST(EngineIOConverterTester, DefaultGPU) {
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
phi::GPUContext ctx(place);
IOConverterTester(ctx);
}
......
......@@ -124,7 +124,7 @@ class TRTConvertValidation {
}
void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
platform::CUDADeviceContext ctx(place_);
phi::GPUContext ctx(place_);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
......@@ -172,7 +172,7 @@ class TRTConvertValidation {
"But received batch_size:%d, max_batch_size_:%d",
batch_size,
max_batch_size_));
platform::CUDADeviceContext ctx(place_);
phi::GPUContext ctx(place_);
op_->Run(scope_, place_);
cudaStreamSynchronize(stream_);
std::vector<std::string> input_output_names;
......
......@@ -347,11 +347,11 @@ int QkvToContextPluginDynamic::enqueue(
TransposeQKV(
batch, seq_len, head_size_, head_number_, input0_data, tptr, stream);
auto *device_ctx = static_cast<platform::CUDADeviceContext *>(
auto *device_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(
platform::CUDAPlace(device_id)));
const platform::CUDADeviceContext &dev_ctx = *device_ctx;
const phi::GPUContext &dev_ctx = *device_ctx;
operators::math::MultiHeadGPUComputeFunctor<float> multihead_compute_func;
multihead_compute_func(dev_ctx,
batch,
......@@ -403,7 +403,7 @@ int QkvToContextPluginDynamic::enqueue(
TransposeQKV(
batch, seq_len, head_size_, head_number_, input0_data, tptr, stream);
auto *device_ctx = static_cast<platform::CUDADeviceContext *>(
auto *device_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(
platform::CUDAPlace(device_id)));
......@@ -414,7 +414,7 @@ int QkvToContextPluginDynamic::enqueue(
apply_scale<<<blocks, threads, 0, stream>>>(
tptr, static_cast<half>(scale_), n_q);
const platform::CUDADeviceContext &dev_ctx = *device_ctx;
const phi::GPUContext &dev_ctx = *device_ctx;
operators::math::MultiHeadGPUComputeFunctor<half> multihead_compute_func;
multihead_compute_func(dev_ctx,
batch,
......
......@@ -34,7 +34,7 @@ namespace tensorrt {
class TensorRTDynamicEngineTest : public ::testing::Test {
protected:
void SetUp() override {
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
ctx_ = new phi::GPUContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get());
......@@ -94,7 +94,7 @@ class TensorRTDynamicEngineTest : public ::testing::Test {
framework::Tensor input_;
framework::Tensor output_;
TensorRTEngine *engine_;
platform::CUDADeviceContext *ctx_;
phi::GPUContext *ctx_;
};
TEST_F(TensorRTDynamicEngineTest, test_spmm) {
......@@ -199,7 +199,7 @@ TEST_F(TensorRTDynamicEngineTest, test_spmm) {
class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test {
protected:
void SetUp() override {
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
ctx_ = new phi::GPUContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get());
......@@ -279,7 +279,7 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test {
std::vector<framework::Tensor> inputs_;
std::vector<framework::Tensor> outputs_;
TensorRTEngine *engine_;
platform::CUDADeviceContext *ctx_;
phi::GPUContext *ctx_;
};
TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) {
......
......@@ -26,7 +26,7 @@ namespace tensorrt {
class TensorRTEngineTest : public ::testing::Test {
protected:
void SetUp() override {
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
ctx_ = new phi::GPUContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get());
......@@ -69,7 +69,7 @@ class TensorRTEngineTest : public ::testing::Test {
framework::Tensor input_;
framework::Tensor output_;
TensorRTEngine *engine_;
platform::CUDADeviceContext *ctx_;
phi::GPUContext *ctx_;
};
TEST_F(TensorRTEngineTest, add_layer) {
......
......@@ -44,7 +44,7 @@ TEST(BestFitAllocator, concurrent_cuda) {
std::unique_ptr<Allocator>(new BestFitAllocator(cuda_allocation.get())));
platform::CUDAPlace gpu(0);
platform::CUDADeviceContext dev_ctx(gpu);
phi::GPUContext dev_ctx(gpu);
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu, dev_ctx.stream())
.get());
......@@ -64,8 +64,7 @@ TEST(BestFitAllocator, concurrent_cuda) {
size_t* data = reinterpret_cast<size_t*>(allocation->ptr());
ForEachFill fill(data);
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx,
allocate_size);
platform::ForRange<phi::GPUContext> for_range(dev_ctx, allocate_size);
for_range(fill);
memory::Copy(platform::CPUPlace(),
......
......@@ -29,53 +29,51 @@ namespace memory {
namespace allocation {
/**
* CUDADeviceContextAllocation is a wrapper of the underlying allocation.
* CUDADeviceContextAllocation adds a CUDA stream callback for the underlying
* allocation so that CUDADeviceContextAllocation can be used in a CUDA stream
* GPUContextAllocation is a wrapper of the underlying allocation.
* GPUContextAllocation adds a CUDA stream callback for the underlying
* allocation so that GPUContextAllocation can be used in a CUDA stream
* which deletes the allocation in the callback.
*/
class CUDADeviceContextAllocation : public Allocation {
class GPUContextAllocation : public Allocation {
public:
explicit CUDADeviceContextAllocation(DecoratedAllocationPtr allocation)
explicit GPUContextAllocation(DecoratedAllocationPtr allocation)
: Allocation(allocation->ptr(),
allocation->base_ptr(),
allocation->size(),
allocation->place()),
underlying_allocation_(std::move(allocation)) {}
~CUDADeviceContextAllocation() {
~GPUContextAllocation() {
PADDLE_ENFORCE_NOT_NULL(
dev_ctx_,
platform::errors::PreconditionNotMet(
"Device context is not set for CUDADeviceContextAllocation"));
"Device context is not set for GPUContextAllocation"));
auto *p_allocation = underlying_allocation_.release();
VLOG(4) << "Adding callback to delete CUDADeviceContextAllocation at "
VLOG(4) << "Adding callback to delete GPUContextAllocation at "
<< p_allocation;
dev_ctx_->AddStreamCallback([p_allocation] {
VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation;
VLOG(4) << "Delete GPUContextAllocation at " << p_allocation;
Allocator::AllocationDeleter(p_allocation);
});
}
void SetCUDADeviceContext(const platform::CUDADeviceContext *dev_ctx) {
dev_ctx_ = dev_ctx;
}
void SetGPUContext(const phi::GPUContext *dev_ctx) { dev_ctx_ = dev_ctx; }
private:
DecoratedAllocationPtr underlying_allocation_;
const platform::CUDADeviceContext *dev_ctx_{nullptr};
const phi::GPUContext *dev_ctx_{nullptr};
};
/**
* CUDADeviceContextAllocator will allocate a CUDADeviceContextAllocation
* GPUContextAllocator will allocate a GPUContextAllocation
* after waiting for a self-created event on the default stream. It does so to
* let non-default streams allocate GPU memory that will be
* released by a stream callback.
*/
class CUDADeviceContextAllocator : public Allocator {
class GPUContextAllocator : public Allocator {
public:
explicit CUDADeviceContextAllocator(platform::CUDAPlace place,
gpuStream_t default_stream)
explicit GPUContextAllocator(platform::CUDAPlace place,
gpuStream_t default_stream)
: place_(place), default_stream_(default_stream) {
platform::CUDADeviceGuard guard(place_.device);
#ifdef PADDLE_WITH_HIP
......@@ -87,7 +85,7 @@ class CUDADeviceContextAllocator : public Allocator {
#endif
}
~CUDADeviceContextAllocator() {
~GPUContextAllocator() {
if (event_) {
platform::CUDADeviceGuard guard(place_.device);
#ifdef PADDLE_WITH_HIP
......@@ -103,9 +101,9 @@ class CUDADeviceContextAllocator : public Allocator {
PADDLE_ENFORCE_NOT_NULL(
default_stream_,
platform::errors::PreconditionNotMet(
"Default stream is not set for CUDADeviceContextAllocator"));
"Default stream is not set for GPUContextAllocator"));
platform::CUDADeviceGuard guard(place_.device);
auto allocation = new CUDADeviceContextAllocation(
auto allocation = new GPUContextAllocation(
static_unique_ptr_cast<Allocation>(memory::Alloc(place_, size)));
// Wait for the event on stream
#ifdef PADDLE_WITH_HIP
......@@ -127,20 +125,20 @@ class CUDADeviceContextAllocator : public Allocator {
};
/**
* CUDADeviceContextAllocatorPool is a singleton that stores a mapping from
* CUDAPlace(s) to std::shared_ptr<CUDADeviceContextAllocator>. When a
* CUDADeviceContext's compute stream isn't the default stream, it can call this
* GPUContextAllocatorPool is a singleton that stores a mapping from
* CUDAPlace(s) to std::shared_ptr<GPUContextAllocator>. When a
* phi::GPUContext's compute stream isn't the default stream, it can call this
* class to allocate GPU memory which will be released by a callback after
* stream execution.
*/
class CUDADeviceContextAllocatorPool {
class GPUContextAllocatorPool {
public:
static CUDADeviceContextAllocatorPool &Instance() {
static CUDADeviceContextAllocatorPool pool;
static GPUContextAllocatorPool &Instance() {
static GPUContextAllocatorPool pool;
return pool;
}
AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) {
AllocationPtr Alloc(const phi::GPUContext &dev_ctx, size_t size) {
auto iter =
allocators_.find(platform::CUDAPlace(dev_ctx.GetPlace().GetDeviceId()));
PADDLE_ENFORCE_NE(
......@@ -149,25 +147,25 @@ class CUDADeviceContextAllocatorPool {
platform::errors::NotFound("No allocator found for CUDAPlace."));
auto &allocator = iter->second;
AllocationPtr allocation = allocator->Allocate(size);
static_cast<CUDADeviceContextAllocation *>(allocation.get())
->SetCUDADeviceContext(&dev_ctx);
static_cast<GPUContextAllocation *>(allocation.get())
->SetGPUContext(&dev_ctx);
return allocation;
}
private:
CUDADeviceContextAllocatorPool() {
GPUContextAllocatorPool() {
std::vector<int> devices = platform::GetSelectedDevices();
for (int i : devices) {
auto place = platform::CUDAPlace(i);
auto compute_stream =
platform::DeviceContextPool::Instance().GetByPlace(place)->stream();
auto allocator = std::shared_ptr<CUDADeviceContextAllocator>(
new CUDADeviceContextAllocator(place, compute_stream));
auto allocator = std::shared_ptr<GPUContextAllocator>(
new GPUContextAllocator(place, compute_stream));
allocators_.insert(make_pair(place, allocator));
}
}
std::map<platform::CUDAPlace, std::shared_ptr<CUDADeviceContextAllocator>>
std::map<platform::CUDAPlace, std::shared_ptr<GPUContextAllocator>>
allocators_;
};
......
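The comment block above describes the deferred-free pattern behind GPUContextAllocation: the memory is only handed back after a callback enqueued on the stream fires, so no in-flight kernel can still be using it. Below is a minimal, self-contained CUDA sketch of that idea. It is illustration only, not Paddle code; the name MarkReleasable is invented, and it assumes CUDA 10+ for cudaLaunchHostFunc. Since a host callback may not call CUDA APIs itself, the callback only records the pointer and the actual cudaFree happens afterwards on the host thread.

// Minimal sketch of the deferred-free idea described above (illustration
// only, not Paddle code; assumes CUDA 10+ for cudaLaunchHostFunc).
#include <cuda_runtime.h>
#include <cstdio>
#include <mutex>
#include <vector>

namespace {
std::mutex g_mu;
std::vector<void*> g_garbage;  // device pointers whose stream work has finished

// Host callbacks must not call CUDA APIs, so this only records the pointer;
// the actual cudaFree happens later on the host thread.
void CUDART_CB MarkReleasable(void* ptr) {
  std::lock_guard<std::mutex> guard(g_mu);
  g_garbage.push_back(ptr);
}

__global__ void Fill(float* x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] = 1.0f;
}
}  // namespace

int main() {
  constexpr int kN = 1 << 20;
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  float* buf = nullptr;
  cudaMalloc(&buf, kN * sizeof(float));

  Fill<<<(kN + 255) / 256, 256, 0, stream>>>(buf, kN);
  // Enqueue the release marker behind the kernel, analogous to the
  // AddStreamCallback call in ~GPUContextAllocation() above.
  cudaLaunchHostFunc(stream, MarkReleasable, buf);

  cudaStreamSynchronize(stream);
  {
    std::lock_guard<std::mutex> guard(g_mu);
    for (void* p : g_garbage) cudaFree(p);
    g_garbage.clear();
  }
  cudaStreamDestroy(stream);
  std::printf("done\n");
  return 0;
}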
......@@ -37,7 +37,7 @@ const int NUM_STREAMS = 8;
const int N = 2;
const float DELTA = 1e-1;
using CudaDevCtxVec = std::vector<std::unique_ptr<platform::CUDADeviceContext>>;
using CudaDevCtxVec = std::vector<std::unique_ptr<phi::GPUContext>>;
__global__ void kernel(float *x, int n) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
......@@ -65,7 +65,7 @@ void CheckKernelOutput(float *x, int n) {
void MultiStreamCompute(float **data,
float **second_data,
const platform::CUDADeviceContext &ctx) {
const phi::GPUContext &ctx) {
// multi-streams
AllocationPtr allocation_ptr = Alloc(ctx, N * sizeof(float));
EXPECT_GE(allocation_ptr->size(), N * sizeof(float));
......@@ -88,7 +88,7 @@ void MultiStreamCompute(float **data,
#endif
}
TEST(Malloc, CUDADeviceContextMultiStream) {
TEST(Malloc, GPUContextMultiStream) {
auto place = platform::CUDAPlace(0);
platform::SetDeviceId(0);
......@@ -110,8 +110,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
main_stream_alloc_ptr.reset();
for (int i = 0; i < NUM_STREAMS; ++i) {
auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place));
auto ctx = std::unique_ptr<phi::GPUContext>(new phi::GPUContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
......@@ -143,7 +142,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
}
}
TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
TEST(Malloc, GPUContextMultiThreadMultiStream) {
auto place = platform::CUDAPlace(0);
platform::SetDeviceId(0);
......@@ -166,8 +165,7 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
main_stream_alloc_ptr.reset();
for (int i = 0; i < NUM_STREAMS; ++i) {
auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place));
auto ctx = std::unique_ptr<phi::GPUContext>(new phi::GPUContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
......
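Consolidating the test hunks above into one place, the per-stream context setup after this change looks as follows. This is only a sketch distilled from the test code shown in the diff; the AllocatorFacade lines are copied from it, and `place` is assumed to be a CUDAPlace.

// Before: a per-stream context was a platform::CUDADeviceContext.
// After: it is a phi::GPUContext, and its allocator must be wired up
// explicitly against the context's own stream, as the updated tests do.
auto place = platform::CUDAPlace(0);
auto ctx = std::unique_ptr<phi::GPUContext>(new phi::GPUContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                      .GetAllocator(place, ctx->stream())
                      .get());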
......@@ -65,7 +65,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
allocation_implicit_stream.reset();
gpuStream_t default_stream =
dynamic_cast<platform::CUDADeviceContext *>(
dynamic_cast<phi::GPUContext *>(
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream();
allocation::AllocationPtr allocation_unique =
......@@ -143,7 +143,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
size_t alloc_size = 256;
gpuStream_t default_stream =
dynamic_cast<platform::CUDADeviceContext *>(
dynamic_cast<phi::GPUContext *>(
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream();
std::shared_ptr<Allocation> allocation_implicit_stream =
......
......@@ -19,8 +19,8 @@
namespace paddle {
namespace operators {
using framework::Tensor;
using phi::GPUContext;
using platform::ActivationDescriptor;
using platform::CUDADeviceContext;
using platform::TensorDescriptor;
#ifdef PADDLE_WITH_HIP
......@@ -39,12 +39,12 @@ template <typename T>
struct CudnnActivationFunctor {
using ELEMENT_TYPE = T;
#ifdef PADDLE_WITH_HIP
CudnnActivationFunctor(const CUDADeviceContext& ctx,
CudnnActivationFunctor(const phi::GPUContext& ctx,
const T& c,
const miopenActivationMode_t& m)
: ctx_(ctx), coef_(c), mode_(m) {}
#else
CudnnActivationFunctor(const CUDADeviceContext& ctx,
CudnnActivationFunctor(const phi::GPUContext& ctx,
const T& c,
const cudnnActivationMode_t& m)
: ctx_(ctx), coef_(c), mode_(m) {}
......@@ -77,7 +77,7 @@ struct CudnnActivationFunctor {
out->mutable_data<T>(ctx_.GetPlace())));
#endif
}
const CUDADeviceContext& ctx_;
const phi::GPUContext& ctx_;
const T coef_;
#ifdef PADDLE_WITH_HIP
const miopenActivationMode_t mode_;
......@@ -90,12 +90,12 @@ template <typename T>
struct CudnnActivationGradFunctor {
using ELEMENT_TYPE = T;
#ifdef PADDLE_WITH_HIP
CudnnActivationGradFunctor(const CUDADeviceContext& ctx,
CudnnActivationGradFunctor(const phi::GPUContext& ctx,
const T& c,
const miopenActivationMode_t& m)
: ctx_(ctx), coef_(c), mode_(m) {}
#else
CudnnActivationGradFunctor(const CUDADeviceContext& ctx,
CudnnActivationGradFunctor(const phi::GPUContext& ctx,
const T& c,
const cudnnActivationMode_t& m)
: ctx_(ctx), coef_(c), mode_(m) {}
......@@ -141,7 +141,7 @@ struct CudnnActivationGradFunctor {
dx->mutable_data<T>(ctx_.GetPlace())));
#endif
}
const CUDADeviceContext& ctx_;
const phi::GPUContext& ctx_;
const T coef_;
#ifdef PADDLE_WITH_HIP
const miopenActivationMode_t mode_;
......@@ -152,12 +152,12 @@ struct CudnnActivationGradFunctor {
template <typename T>
struct CudnnReluFunctor : public CudnnActivationFunctor<T> {
explicit CudnnReluFunctor(const CUDADeviceContext& ctx)
explicit CudnnReluFunctor(const phi::GPUContext& ctx)
: CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {}
};
template <typename T>
struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx)
explicit CudnnReluGradFunctor(const phi::GPUContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {}
static constexpr ActBwdOpFwdDeps FwdDeps() {
......@@ -167,12 +167,12 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> {
template <typename T>
struct CudnnRelu6Functor : public CudnnActivationFunctor<T> {
explicit CudnnRelu6Functor(const CUDADeviceContext& ctx)
explicit CudnnRelu6Functor(const phi::GPUContext& ctx)
: CudnnActivationFunctor<T>(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {}
};
template <typename T>
struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnRelu6GradFunctor(const CUDADeviceContext& ctx)
explicit CudnnRelu6GradFunctor(const phi::GPUContext& ctx)
: CudnnActivationGradFunctor<T>(
ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {}
......@@ -183,12 +183,12 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> {
template <typename T>
struct CudnnSigmoidFunctor : public CudnnActivationFunctor<T> {
explicit CudnnSigmoidFunctor(const CUDADeviceContext& ctx)
explicit CudnnSigmoidFunctor(const phi::GPUContext& ctx)
: CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {}
};
template <typename T>
struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx)
explicit CudnnSigmoidGradFunctor(const phi::GPUContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {}
static constexpr ActBwdOpFwdDeps FwdDeps() {
......@@ -198,12 +198,12 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> {
template <typename T>
struct CudnnTanhFunctor : public CudnnActivationFunctor<T> {
explicit CudnnTanhFunctor(const CUDADeviceContext& ctx)
explicit CudnnTanhFunctor(const phi::GPUContext& ctx)
: CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {}
};
template <typename T>
struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx)
explicit CudnnTanhGradFunctor(const phi::GPUContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {}
static constexpr ActBwdOpFwdDeps FwdDeps() {
......@@ -221,7 +221,7 @@ class CudnnActivationKernel
framework::Tensor* Out = nullptr;
ExtractActivationTensor(context, &X, &Out);
Out->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<CUDADeviceContext>();
auto& dev_ctx = context.template device_context<phi::GPUContext>();
Functor functor(dev_ctx);
functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivation"), Out);
}
......@@ -242,7 +242,7 @@ class CudnnActivationGradKernel
ExtractActivationGradTensor<Functor::FwdDeps()>(
context, &X, &Out, &dOut, &dX);
dX->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<CUDADeviceContext>();
auto& dev_ctx = context.template device_context<phi::GPUContext>();
Functor functor(dev_ctx);
functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivationGrad"),
GET_DATA_SAFELY(Out, "Input", "Out", "CudnnActivationGrad"),
......
......@@ -194,87 +194,74 @@ using CudaELUGradNegativeAlphaFunctor =
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#define REGISTER_ACTIVATION_CUDA_KERNEL( \
act_type, op_name, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \
act_type, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \
ops::functor<float>>, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \
ops::functor<double>>, \
ops::ActivationCudaKernel<plat::CUDADeviceContext, \
ops::functor<plat::float16>>, \
ops::ActivationCudaKernel<plat::CUDADeviceContext, \
ops::functor<plat::bfloat16>>); \
REGISTER_OP_CUDA_KERNEL( \
act_type##_grad, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
#define REGISTER_ACTIVATION_CUDA_KERNEL( \
act_type, op_name, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \
act_type, \
ops::ActivationCudaKernel<phi::GPUContext, ops::functor<float>>, \
ops::ActivationCudaKernel<phi::GPUContext, ops::functor<double>>, \
ops::ActivationCudaKernel<phi::GPUContext, ops::functor<plat::float16>>, \
ops::ActivationCudaKernel<phi::GPUContext, \
ops::functor<plat::bfloat16>>); \
REGISTER_OP_CUDA_KERNEL( \
act_type##_grad, \
ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::grad_functor<float>>, \
ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::grad_functor<double>>, \
ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::grad_functor<plat::float16>>, \
ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::grad_functor<plat::bfloat16>>);
#define REGISTER_ACTIVATION_CUDA_KERNEL_INT( \
act_type, op_name, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \
act_type, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \
ops::functor<float>>, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \
ops::functor<double>>, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \
ops::functor<int>>, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \
ops::functor<int64_t>>, \
ops::ActivationCudaKernel<plat::CUDADeviceContext, \
ops::functor<plat::float16>>, \
ops::ActivationCudaKernel<plat::CUDADeviceContext, \
ops::functor<plat::bfloat16>>); \
REGISTER_OP_CUDA_KERNEL( \
act_type##_grad, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<int>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<int64_t>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
#define REGISTER_ACTIVATION_CUDA_KERNEL_INT( \
act_type, op_name, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \
act_type, \
ops::ActivationCudaKernel<phi::GPUContext, ops::functor<float>>, \
ops::ActivationCudaKernel<phi::GPUContext, ops::functor<double>>, \
ops::ActivationCudaKernel<phi::GPUContext, ops::functor<int>>, \
ops::ActivationCudaKernel<phi::GPUContext, ops::functor<int64_t>>, \
ops::ActivationCudaKernel<phi::GPUContext, ops::functor<plat::float16>>, \
ops::ActivationCudaKernel<phi::GPUContext, \
ops::functor<plat::bfloat16>>); \
REGISTER_OP_CUDA_KERNEL( \
act_type##_grad, \
ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::grad_functor<float>>, \
ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::grad_functor<double>>, \
ops::ActivationGradCudaKernel<phi::GPUContext, ops::grad_functor<int>>, \
ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::grad_functor<int64_t>>, \
ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::grad_functor<plat::float16>>, \
ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::grad_functor<plat::bfloat16>>);
REGISTER_OP_CUDA_KERNEL(
relu6,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
ops::CudaRelu6Functor<float>>,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
ops::CudaRelu6Functor<double>>,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
ops::CudaRelu6Functor<int>>,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
ops::CudaRelu6Functor<int64_t>>,
ops::ActivationCudaKernel<plat::CUDADeviceContext,
ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<float>>,
ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<double>>,
ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<int>>,
ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<int64_t>>,
ops::ActivationCudaKernel<phi::GPUContext,
ops::CudaRelu6Functor<plat::float16>>,
ops::ActivationCudaKernel<plat::CUDADeviceContext,
ops::ActivationCudaKernel<phi::GPUContext,
ops::CudaRelu6Functor<plat::bfloat16>>);
REGISTER_OP_CUDA_KERNEL(
relu6_grad,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<float>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<double>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<int>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<int64_t>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<plat::float16>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<plat::bfloat16>>);
#define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \
......
......@@ -211,7 +211,7 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
using CUDA = phi::GPUContext;
REGISTER_OP_CUDA_KERNEL(affine_channel,
ops::AffineChannelCUDAKernel<CUDA, float>,
......
......@@ -35,7 +35,7 @@ class CUDNNAffineGridOpKernel : public framework::OpKernel<T> {
platform::errors::InvalidArgument(
"Only support for CUDAPlace.Please switch your context from "
"CPUPlace to CUDAPlace or update your cudnn."));
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto handle = dev_ctx.cudnn_handle();
auto* theta = ctx.Input<Tensor>("Theta");
auto* output = ctx.Output<Tensor>("Output");
......@@ -83,7 +83,7 @@ class CUDNNAffineGridGradOpKernel : public framework::OpKernel<T> {
"support for CUDAPlace. Please switch "
"your context from CPUPlace to "
"CUDAPlace or update your cudnn."));
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto handle = dev_ctx.cudnn_handle();
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
......
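Most operator-side changes in this commit reduce to the same substitution when fetching the device context inside a kernel. A condensed before/after sketch, using only calls that appear in the hunks above and below:

// Before:
// auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
// After:
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto handle = dev_ctx.cudnn_handle();  // cuDNN handle, as used by the affine_grid kernels
auto stream = dev_ctx.stream();        // compute stream, as used by the collective ops below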
......@@ -29,7 +29,7 @@ __global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
}
template <typename T>
struct Linspace<paddle::platform::CUDADeviceContext, T> {
struct Linspace<phi::GPUContext, T> {
void operator()(T start,
T end,
int count,
......@@ -191,7 +191,7 @@ class AffineGridGradOpCUDAKernel : public framework::OpKernel<T> {
w = size_attr[3];
}
T* theta_grad_data = theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace());
phi::funcs::SetConstant<paddle::platform::CUDADeviceContext, T>()(
phi::funcs::SetConstant<phi::GPUContext, T>()(
ctx.cuda_device_context(), theta_grad, static_cast<T>(0));
T h_step;
......
......@@ -82,7 +82,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
......@@ -92,8 +92,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
framework::Tensor inverse_scale =
ctx.AllocateTmpTensor<MPDType, platform::CUDADeviceContext>({1},
dev_ctx);
ctx.AllocateTmpTensor<MPDType, phi::GPUContext>({1}, dev_ctx);
MPDType* inverse_scale_v = inverse_scale.template data<MPDType>();
InverseAndMemset<MPDType><<<1, 1, 0, dev_ctx.stream()>>>(
......
......@@ -87,11 +87,9 @@ __global__ void FusedFillIf(T** outs,
}
template <typename T, bool IsFoundInfOnCPU>
class UpdateLossScalingFunctor<platform::CUDADeviceContext,
T,
IsFoundInfOnCPU> {
class UpdateLossScalingFunctor<phi::GPUContext, T, IsFoundInfOnCPU> {
public:
void operator()(const platform::CUDADeviceContext& dev_ctx,
void operator()(const phi::GPUContext& dev_ctx,
const bool* found_inf_data,
const T* pre_loss_scaling_data,
const int* good_in_data,
......@@ -134,9 +132,9 @@ class UpdateLossScalingFunctor<platform::CUDADeviceContext,
};
template <typename T>
class LazyZeros<platform::CUDADeviceContext, T> {
class LazyZeros<phi::GPUContext, T> {
public:
void operator()(const platform::CUDADeviceContext& dev_ctx,
void operator()(const phi::GPUContext& dev_ctx,
const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const {
......@@ -204,7 +202,7 @@ class LazyZeros<platform::CUDADeviceContext, T> {
namespace ops = paddle::operators;
namespace plat = paddle::platform;
using GPU = paddle::platform::CUDADeviceContext;
using GPU = phi::GPUContext;
REGISTER_OP_CUDA_KERNEL(update_loss_scaling,
ops::UpdateLossScalingKernel<GPU, float>,
......
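The functor specializations above (UpdateLossScalingFunctor, LazyZeros) show the general pattern this commit relies on: device-specific behaviour is selected by specializing a template on phi::GPUContext instead of on the removed CUDADeviceContext alias. A stripped-down sketch of the pattern, with a hypothetical ScaleFunctor standing in for the real functors:

// Hypothetical example of the specialization pattern; ScaleFunctor is not a
// Paddle symbol, it only mirrors the shape of the functors in the diff above.
template <typename DeviceContext, typename T>
struct ScaleFunctor;  // primary template, left undefined

template <typename T>
struct ScaleFunctor<phi::GPUContext, T> {
  // A GPU implementation would launch its kernel on dev_ctx.stream().
  void operator()(const phi::GPUContext& dev_ctx,
                  const T* in,
                  T* out,
                  int64_t n) const;
};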
......@@ -54,7 +54,7 @@ struct ArrayToLoDFunctor : public std::unary_function<platform::Place, void> {
Apply(static_cast<phi::CPUContext *>(pool.Get(place)));
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Apply(static_cast<platform::CUDADeviceContext *>(pool.Get(place)));
Apply(static_cast<phi::GPUContext *>(pool.Get(place)));
#else
PADDLE_THROW(
platform::errors::Unavailable("Paddle is not compiled with CUDA."));
......
......@@ -82,8 +82,7 @@ class AssignPosCUDAKernel : public framework::OpKernel<T> {
*eff_num_len, platform::CPUPlace(), &cpu_eff_num_len);
cpu_eff_num_len_data = cpu_eff_num_len.data<T>()[0];
}
const auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
const auto& dev_ctx = context.template device_context<phi::GPUContext>();
framework::DDim out_dims = phi::make_ddim({cpu_eff_num_len_data});
auto out_data = out->mutable_data<T>(out_dims, place);
......
......@@ -114,9 +114,9 @@ class BatchFCCUDAKernel : public framework::OpKernel<T> {
T* out_data = output->mutable_data<T>(ctx.GetPlace());
// initialize
auto out_eigen = framework::EigenVector<T>::Flatten(*output);
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto& place = *ctx.template device_context<platform::CUDADeviceContext>()
.eigen_device();
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto& place =
*ctx.template device_context<phi::GPUContext>().eigen_device();
out_eigen.device(place) = out_eigen.constant(static_cast<T>(0));
CBLAS_TRANSPOSE transA = CblasNoTrans;
......@@ -127,7 +127,7 @@ class BatchFCCUDAKernel : public framework::OpKernel<T> {
int64_t strideA = ins_num * in_dim;
int64_t strideB = in_dim * out_dim;
auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
blas.BatchedGEMM(transA,
transB,
ins_num,
......@@ -169,9 +169,9 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> {
auto in_dim = input_dims[2];
auto out_dim = w_dims[2];
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto& place = *ctx.template device_context<platform::CUDADeviceContext>()
.eigen_device();
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto& place =
*ctx.template device_context<phi::GPUContext>().eigen_device();
// initialize
dx->mutable_data<T>(ctx.GetPlace());
auto dx_eigen = framework::EigenVector<T>::Flatten(*dx);
......@@ -199,7 +199,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> {
out_dim,
db_data);
auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
T alpha = 1;
T beta = 0;
......@@ -238,7 +238,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
using GPUCtx = paddle::platform::CUDADeviceContext;
using GPUCtx = phi::GPUContext;
REGISTER_OP_CUDA_KERNEL(batch_fc,
ops::BatchFCCUDAKernel<GPUCtx, float>,
ops::BatchFCCUDAKernel<GPUCtx, double>);
......
......@@ -17,9 +17,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
beam_search,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(beam_search,
ops::BeamSearchOpKernel<phi::GPUContext, float>,
ops::BeamSearchOpKernel<phi::GPUContext, double>,
ops::BeamSearchOpKernel<phi::GPUContext, int>,
ops::BeamSearchOpKernel<phi::GPUContext, int64_t>);
......@@ -18,7 +18,7 @@ limitations under the License. */
namespace ops = paddle::operators;
namespace plat = paddle::platform;
using CUDA = paddle::platform::CUDADeviceContext;
using CUDA = phi::GPUContext;
// See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc
REGISTER_OP_CUDA_KERNEL(transfer_dtype,
ops::CastOpKernel<CUDA, float>,
......
......@@ -150,7 +150,7 @@ class CenterLossCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
using GPUCtx = paddle::platform::CUDADeviceContext;
using GPUCtx = phi::GPUContext;
REGISTER_OP_CUDA_KERNEL(center_loss,
ops::CenterLossCUDAKernel<GPUCtx, float>,
ops::CenterLossCUDAKernel<GPUCtx, double>);
......
......@@ -17,8 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace ops = paddle::operators;
using CUDADeviceContext = paddle::platform::CUDADeviceContext;
/* see [Why use single type kernel] */
REGISTER_OP_CUDA_KERNEL(
cinn_instruction_run,
ops::CinnInstructionRunOpKernel<CUDADeviceContext, float>);
ops::CinnInstructionRunOpKernel<phi::GPUContext, float>);
......@@ -18,6 +18,4 @@ limitations under the License. */
/* see [Why use single type kernel] */
REGISTER_OP_CUDA_KERNEL(
cinn_launch,
paddle::operators::CinnLaunchOpKernel<paddle::platform::CUDADeviceContext,
float>);
cinn_launch, paddle::operators::CinnLaunchOpKernel<phi::GPUContext, float>);
......@@ -21,10 +21,8 @@ namespace paddle::operators::details {
#ifdef PADDLE_WITH_CUDA
template <>
void* GetStream<platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx) {
const auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
void* GetStream<phi::GPUContext>(const framework::ExecutionContext& ctx) {
const auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
return dev_ctx.stream();
}
#endif
......
......@@ -40,8 +40,7 @@ void* GetStream(const framework::ExecutionContext& ctx) {
#ifdef PADDLE_WITH_CUDA
template <>
void* GetStream<platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx);
void* GetStream<phi::GPUContext>(const framework::ExecutionContext& ctx);
#endif
} // namespace details
......
......@@ -375,7 +375,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> {
platform::NCCLCommContext::Instance().Get(rid, ctx.GetPlace());
// use global calculate stream
const auto calcu_stream =
static_cast<platform::CUDADeviceContext*>(
static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(ctx.GetPlace()))
->stream();
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
......@@ -607,6 +607,5 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
class_center_sample,
ops::ClassCenterSampleCUDAKernel<paddle::platform::CUDADeviceContext,
int64_t>,
ops::ClassCenterSampleCUDAKernel<paddle::platform::CUDADeviceContext, int>);
ops::ClassCenterSampleCUDAKernel<phi::GPUContext, int64_t>,
ops::ClassCenterSampleCUDAKernel<phi::GPUContext, int>);
......@@ -519,11 +519,10 @@ REGISTER_OP_CPU_KERNEL(coalesce_tensor,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
REGISTER_OP_CUDA_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
ops::CoalesceTensorOpKernel<phi::GPUContext, plat::float16>,
ops::CoalesceTensorOpKernel<phi::GPUContext, int>,
ops::CoalesceTensorOpKernel<phi::GPUContext, float>,
ops::CoalesceTensorOpKernel<phi::GPUContext, double>);
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
......
......@@ -17,10 +17,9 @@ limitations under the License. */
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
allreduce,
ops::AllReduceOpKernel<plat::CUDADeviceContext, float>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, double>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, int>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, int64_t>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(allreduce,
ops::AllReduceOpKernel<phi::GPUContext, float>,
ops::AllReduceOpKernel<phi::GPUContext, double>,
ops::AllReduceOpKernel<phi::GPUContext, int>,
ops::AllReduceOpKernel<phi::GPUContext, int64_t>,
ops::AllReduceOpKernel<phi::GPUContext, plat::float16>);
......@@ -38,7 +38,7 @@ class AllReduceOpKernel : public framework::OpKernel<T> {
platform::errors::PreconditionNotMet(
"AllReduce op can run on gpu place only for now."));
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
......
......@@ -47,7 +47,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
......
......@@ -40,7 +40,7 @@ class BarrierOpCUDAKernel : public framework::OpKernel<T> {
int rid = ctx.Attr<int>("ring_id");
auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
auto stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
auto stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
ncclRedOp_t nccl_red_type = ncclSum;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));
......
......@@ -54,7 +54,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
platform::errors::PreconditionNotMet("Currently, the broadcast op can "
"only be an In-Place operation."));
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto comm = dev_ctx.nccl_comm();
auto stream = dev_ctx.stream();
......
......@@ -68,7 +68,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
......
......@@ -419,7 +419,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
......
......@@ -54,7 +54,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
......
......@@ -90,7 +90,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel<T> {
T* recv_buff = temp_out.data<T>();
gpuStream_t stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::ncclAllGather(send_buff,
......@@ -113,9 +113,9 @@ class CConcatOpCUDAKernel : public framework::OpKernel<T> {
offset += rows_per_tensor;
}
math::ConcatFunctor<platform::CUDADeviceContext, T> functor;
math::ConcatFunctor<phi::GPUContext, T> functor;
out->mutable_data<T>(out_dims, place);
auto& dev_ctx2 = ctx.template device_context<platform::CUDADeviceContext>();
auto& dev_ctx2 = ctx.template device_context<phi::GPUContext>();
functor(dev_ctx2, inputs, axis, out);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
......
......@@ -91,8 +91,7 @@ class CEmbeddingCUDAKernel : public framework::OpKernel<T> {
auto *ids_t = context.Input<LoDTensor>("Ids");
auto *output_t = context.Output<LoDTensor>("Out");
const auto &dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
const auto &dev_ctx = context.template device_context<phi::GPUContext>();
const int64_t start_idx = context.Attr<int64_t>("start_index");
size_t N = table_t->dims()[0];
size_t D = table_t->dims()[1];
......@@ -142,8 +141,7 @@ template <typename T>
class CEmbeddingGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
const auto &dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
const auto &dev_ctx = context.template device_context<phi::GPUContext>();
const int64_t start_idx = context.Attr<int64_t>("start_index");
auto ids_t = context.Input<LoDTensor>("Ids");
auto d_output_t = context.Input<LoDTensor>(framework::GradVarName("Out"));
......
......@@ -312,7 +312,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
......
......@@ -55,7 +55,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
......
......@@ -61,7 +61,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
......
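The same stream-selection idiom recurs across the collective kernels above (c_allgather, c_allreduce, c_broadcast, c_concat, c_reduce, c_reducescatter, c_scatter); only the context type in the cast changes. Gathered into one snippet, taken directly from the lines shown:

gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
  // reuse the global compute stream owned by the pooled phi::GPUContext
  auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
  stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else {
  // use the NCCL communicator's dedicated stream
  stream = comm->stream();
}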
......@@ -108,10 +108,10 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
const auto& place = ctx.GetPlace();
const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place);
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
// use global calculate stream
const auto stream = static_cast<platform::CUDADeviceContext*>(
const auto stream = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
......@@ -136,8 +136,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
// step 1, obtain logit_max
Tensor logits_max;
logits_max =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
void* logits_max_buff = logits_max.mutable_data<T>(place);
auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
......@@ -166,7 +165,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
// step 3, obtain predict target
Tensor predicted_logits;
predicted_logits =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
predicted_logits.mutable_data<T>(place);
auto t = framework::EigenVector<T>::Flatten(predicted_logits);
......@@ -217,8 +216,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
// step 5, obtain sum_exp_logits
Tensor sum_exp_logits;
sum_exp_logits =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
......@@ -262,7 +260,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
const int rank = ctx.Attr<int>("rank");
const auto& place = ctx.GetPlace();
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto map = distributed::ProcessGroupMapFromGid::getInstance();
distributed::ProcessGroup* pg = map->get(rid);
......@@ -290,8 +288,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
// step 1, obtain logit_max
Tensor logits_max;
logits_max =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
Eigen::DSizes<int, 1> along_axis(1);
......@@ -314,7 +311,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
// step 3, obtain predict target
Tensor predicted_logits;
predicted_logits =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
predicted_logits.mutable_data<T>(place);
auto t = framework::EigenVector<T>::Flatten(predicted_logits);
......@@ -358,8 +355,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
// step 5, obtain sum_exp_logits
Tensor sum_exp_logits;
sum_exp_logits =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
......@@ -395,8 +391,7 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
context.Output<Tensor>(framework::GradVarName("Logits"));
const Tensor* softmax = context.Input<Tensor>("Softmax");
const int rank = context.Attr<int>("rank");
auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
auto& dev_ctx = context.template device_context<phi::GPUContext>();
if (logit_grad != softmax) {
framework::TensorCopy(
......